[Bf-blender-cvs] [f64fe321fa6] cycles-x: Merge branch 'master' into cycles-x

Tue Jul 27 12:13:55 CEST 2021

Commit: f64fe321fa65f7c36ebf7104b84868d1bb65a2fc
Author: Sergey Sharybin
Date:   Tue Jul 27 12:13:46 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBf64fe321fa65f7c36ebf7104b84868d1bb65a2fc

Merge branch 'master' into cycles-x

===================================================================



===================================================================

diff --cc intern/cycles/device/cuda/device_impl.cpp
index 0eaf787dbd7,00000000000..37fab8f8293
mode 100644,000000..100644

--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@@ -1,1370 -1,0 +1,1370 @@@
 +/*
 + * Copyright 2011-2013 Blender Foundation
 + *
 + * Licensed under the Apache License, Version 2.0 (the "License");
 + * you may not use this file except in compliance with the License.
 + * You may obtain a copy of the License at
 + *
 + * http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +#ifdef WITH_CUDA
 +
 +#  include <climits>
 +#  include <limits.h>
 +#  include <stdio.h>
 +#  include <stdlib.h>
 +#  include <string.h>
 +
 +#  include "device/cuda/device_impl.h"
 +
 +#  include "render/buffers.h"
 +
 +#  include "util/util_debug.h"
 +#  include "util/util_foreach.h"
 +#  include "util/util_logging.h"
 +#  include "util/util_map.h"
 +#  include "util/util_md5.h"
 +#  include "util/util_opengl.h"
 +#  include "util/util_path.h"
 +#  include "util/util_string.h"
 +#  include "util/util_system.h"
 +#  include "util/util_time.h"
 +#  include "util/util_types.h"
 +#  include "util/util_windows.h"
 +
 +CCL_NAMESPACE_BEGIN
 +
 +class CUDADevice;
 +
 +/* Report whether the "lib" path, where pre-compiled kernels are looked up, exists. */
 +bool CUDADevice::have_precompiled_kernels()
 +{
 +  return path_exists(path_get("lib"));
 +}
 +
 +/* Showing samples is always fine: the CUDADevice only processes one tile at a time. */
 +bool CUDADevice::show_samples() const
 +{
 +  return true;
 +}
 +
 +/* The only BVH layout reported by this device is BVH2. */
 +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
 +{
 +  const BVHLayoutMask supported_layouts = BVH_LAYOUT_BVH2;
 +  return supported_layouts;
 +}
 +
 +void CUDADevice::set_error(const string &error)
 +{
 +  Device::set_error(error);
 +
 +  if (first_error) {
 +    fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
 +    fprintf(stderr,
 +            "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
 +    first_error = false;
 +  }
 +}
 +
 +/* Set up the CUDA device with ordinal `info.num`: reset the member state, initialize the
 + * driver API, query the host-mapping and pitch-alignment attributes, create the context and
 + * record the compute architecture.
 + *
 + * When initialization, the device lookup or the context creation fails, an error is set on
 + * the device and construction returns early. */
 +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
 +    : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
 +{
 +  first_error = true;
 +
 +  cuDevId = info.num;
 +  cuDevice = 0;
 +  cuContext = 0;
 +
 +  cuModule = 0;
 +
 +  need_texture_info = false;
 +
 +  device_texture_headroom = 0;
 +  device_working_headroom = 0;
 +  move_texture_to_host = false;
 +  map_host_limit = 0;
 +  map_host_used = 0;
 +  can_map_host = 0;
 +  pitch_alignment = 0;
 +
 +  /* Initialize CUDA. */
 +  CUresult result = cuInit(0);
 +  if (result != CUDA_SUCCESS) {
 +    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
 +    return;
 +  }
 +
 +  /* Setup device and context. */
 +  result = cuDeviceGet(&cuDevice, cuDevId);
 +  if (result != CUDA_SUCCESS) {
 +    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
 +                            cuewErrorString(result)));
 +    return;
 +  }
 +
 +  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
 +   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
 +   * so we can predict which memory to map to host. */
 +  cuda_assert(
 +      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
 +
 +  cuda_assert(cuDeviceGetAttribute(
 +      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
 +
 +  unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
 +  if (can_map_host) {
 +    ctx_flags |= CU_CTX_MAP_HOST;
 +    init_host_memory();
 +  }
 +
 +  /* Create context. */
 +  result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
 +
 +  if (result != CUDA_SUCCESS) {
 +    set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
 +    return;
 +  }
 +
 +  /* Query the compute capability through the device handle obtained from cuDeviceGet (not
 +   * the ordinal), like the attribute queries above. The outputs are zero-initialized and the
 +   * calls are checked, so a failed query cannot feed indeterminate values into the
 +   * architecture number. */
 +  int major = 0, minor = 0;
 +  cuda_assert(
 +      cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
 +  cuda_assert(
 +      cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
 +  cuDevArchitecture = major * 100 + minor * 10;
 +
 +  /* Pop context set by cuCtxCreate. */
 +  cuCtxPopCurrent(NULL);
 +}
 +
 +/* Free `texture_info` and destroy the CUDA context, if one was created. */
 +CUDADevice::~CUDADevice()
 +{
 +  texture_info.free();
 +
 +  /* The constructor sets `cuContext` to 0 and returns early when CUDA initialization or the
 +   * device lookup fails. In that case there is no context to destroy, so do not hand a null
 +   * context to cuCtxDestroy. */
 +  if (cuContext) {
 +    cuda_assert(cuCtxDestroy(cuContext));
 +  }
 +}
 +
 +/* Check that the device's compute capability is high enough for the CUDA backend.
 + *
 + * The kernel features argument is unused. Returns true for compute capability 3.0 and up;
 + * otherwise sets an error on the device and returns false. */
 +bool CUDADevice::support_device(const uint /*kernel_features*/)
 +{
 +  /* Zero-initialize so that a failed query is reported as an unsupported device instead of
 +   * reading indeterminate values. Query through the device handle rather than the ordinal. */
 +  int major = 0, minor = 0;
 +  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice);
 +  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice);
 +
 +  /* We only support sm_30 and above */
 +  if (major < 3) {
 +    set_error(string_printf(
 +        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
 +    return false;
 +  }
 +
 +  return true;
 +}
 +
 +/* Check whether peer access with `peer_device` is possible and, if so, enable it between the
 + * two CUDA contexts in both directions.
 + *
 + * Returns false when the peer is this device, is not a CUDA or OptiX device, cannot be
 + * accessed (including CUDA array access), or when enabling access fails. In the last case an
 + * error is set on this device. */
 +bool CUDADevice::check_peer_access(Device *peer_device)
 +{
 +  if (peer_device == this) {
 +    return false;
 +  }
 +
 +  const bool is_cuda_based = (peer_device->info.type == DEVICE_CUDA ||
 +                              peer_device->info.type == DEVICE_OPTIX);
 +  if (!is_cuda_based) {
 +    return false;
 +  }
 +
 +  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
 +
 +  int can_access = 0;
 +  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
 +  if (!can_access) {
 +    return false;
 +  }
 +
 +  /* Array access over the link has to be possible as well (for 3D textures). */
 +  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
 +                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
 +                                      cuDevice,
 +                                      peer_device_cuda->cuDevice));
 +  if (!can_access) {
 +    return false;
 +  }
 +
 +  /* Enable access to `target` while the context of `owner` is in scope. Failures are always
 +   * reported on this device. */
 +  const auto enable_access = [this](CUDADevice *owner, CUcontext target) -> bool {
 +    const CUDAContextScope scope(owner);
 +    const CUresult result = cuCtxEnablePeerAccess(target, 0);
 +    if (result == CUDA_SUCCESS) {
 +      return true;
 +    }
 +    set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
 +                            cuewErrorString(result)));
 +    return false;
 +  };
 +
 +  /* Both directions are needed; the second one is not attempted when the first one fails. */
 +  return enable_access(this, peer_device_cuda->cuContext) &&
 +         enable_access(peer_device_cuda, cuContext);
 +}
 +
 +/* Whether adaptive compilation is turned on in the CUDA debug flags. */
 +bool CUDADevice::use_adaptive_compilation()
 +{
 +  const bool adaptive_compile = DebugFlags().cuda.adaptive_compile;
 +  return adaptive_compile;
 +}
 +
 +/* Common NVCC flags: these stay the same regardless of the shading model and the kernel
 + * sources md5, and only depend on the compiler or the compilation settings.
 + *
 + * `kernel_features` is only baked into the flags when adaptive compilation is used. Extra
 + * flags from the CYCLES_CUDA_EXTRA_CFLAGS environment variable are appended when it is set.
 + */
 +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
 +{
 +  const int cpu_bits = system_cpu_bits();
 +  const string include_dir = path_get("source");
 +
 +  string cflags = string_printf(
 +      "-m%d "
 +      "--ptxas-options=\"-v\" "
 +      "--use_fast_math "
 +      "-DNVCC "
 +      "-I\"%s\"",
 +      cpu_bits,
 +      include_dir.c_str());
 +
 +  if (use_adaptive_compilation()) {
 +    cflags += " -D__KERNEL_FEATURES__=";
 +    cflags += to_string(kernel_features);
 +  }
 +
 +  if (const char *env_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS")) {
 +    cflags += " ";
 +    cflags += env_cflags;
 +  }
 +
 +#  ifdef WITH_NANOVDB
 +  cflags += " -DWITH_NANOVDB";
 +#  endif
 +
 +  return cflags;
 +}
 +
 +string CUDADevice::compile_kernel(const uint kernel_features,
 +                                  const char *name,
 +                                  const char *base,
 +                                  bool force_ptx)
 +{
 +  /* Compute kernel name. */
 +  int major, minor;
 +  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
 +  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 +
 +  /* Attempt to use kernel provided with Blender. */
 +  if (!use_adaptive_compilation()) {
 +    if (!force_ptx) {
 +      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
 +      VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 +      if (path_exists(cubin)) {
 +        VLOG(1) << "Using precompiled kernel.";
 +        return cubin;
 +      }
 +    }
 +
 +    /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
 +    int ptx_major = major, ptx_minor = minor;
 +    while (ptx_major >= 3) {
 +      const string ptx = path_get(
 +          string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
 +      VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
 +      if (path_exists(ptx)) {
 +        VLOG(1) << "Using precompiled kernel.";
 +        return ptx;
 +      }
 +
 +      if (ptx_minor > 0) {
 +        ptx_minor--;
 +      }
 +      else {
 +        ptx_major--;
 +        ptx_minor = 9;
 +      }
 +    }
 +  }
 +
 +  /* Try to use locally compiled kernel. */
 +  string source_path = path_get("source");
 +  const string source_md5 = path_files_md5_hash(source_path);
 +
 +  /* We include cflags into md5 so changing cuda toolkit or changing other
 +   * compiler command line arguments makes sure cubin gets re-built.
 +   */
 +  string common_cflags = compile_kernel_get_common_cflags(kernel_features);
 +  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
 +
 +  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
 +  const char *const kernel_arch = force_ptx ? "compute" : "sm";
 +  const string cubin_file = string_printf(
 +      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
 +  const string cubin = path_cache_get(path_join("kernels", cubin_file));
 +  VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
 +  if (path_

@@ Diff output truncated at 10240 characters. @@