| /* Copyright 2021 Google LLC. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #ifndef RUY_RUY_TRACE_H_ |
| #define RUY_RUY_TRACE_H_ |
| |
| #ifdef RUY_TRACE |
| |
| #include <algorithm> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <memory> |
| #include <mutex> |
| #include <string> |
| #include <thread> |
| #include <vector> |
| |
| #include "ruy/mat.h" |
| #include "ruy/matrix.h" |
| #include "ruy/path.h" |
| #include "ruy/platform.h" |
| #include "ruy/side_pair.h" |
| |
| namespace ruy { |
| |
| // Helper for `formatted` so we don't have to put .c_str() on strings. |
| template <typename T> |
| T value_for_snprintf(T value) { |
| return value; |
| } |
| |
| inline const char* value_for_snprintf(const std::string& s) { |
| return s.c_str(); |
| } |
| |
| // A sprintf-like function returning a std::string. |
| // Remove this once we can rely on std::format (c++20). |
| template <typename... Args> |
| std::string formatted(const char* format, Args... args) { |
| char buf[1024]; |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic warning "-Wformat-security" |
| int size = snprintf(buf, sizeof buf, format, value_for_snprintf(args)...); |
| #pragma GCC diagnostic pop |
| if (size <= 0) { |
| abort(); |
| } |
| return std::string(buf); |
| } |
| |
| // An entry in the trace. |
| struct ThreadTraceEntry final { |
| std::string text; |
| int indent = 0; |
| const char* source_file = nullptr; |
| int source_line = 0; |
| }; |
| |
| // Trace for one thread. |
| class ThreadTrace final { |
| public: |
| ~ThreadTrace() {} |
| |
| void set_thread_id(int thread_id) { thread_id_ = thread_id; } |
| int thread_id() const { return thread_id_; } |
| |
| bool is_in_run_ahead_packing_loop() const { |
| return is_in_run_ahead_packing_loop_; |
| } |
| void set_is_in_run_ahead_packing_loop(bool value) { |
| is_in_run_ahead_packing_loop_ = value; |
| } |
| |
| void set_current_source_file(const char* source_file) { |
| current_source_file_ = source_file; |
| } |
| |
| void set_current_source_line(int source_line) { |
| current_source_line_ = source_line; |
| } |
| |
| const std::vector<ThreadTraceEntry>& entries() const { return entries_; } |
| |
| template <typename... Args> |
| void Write(const char* format, Args... args) { |
| ThreadTraceEntry entry; |
| entry.text = formatted(format, args...); |
| entry.indent = indent_; |
| entry.source_file = current_source_file_; |
| entry.source_line = current_source_line_; |
| entries_.emplace_back(std::move(entry)); |
| } |
| |
| template <typename... Args> |
| void EnterScope(const char* scope_name) { |
| Write("%s {", scope_name); |
| indent_++; |
| } |
| void LeaveScope(const char* scope_name) { |
| indent_--; |
| Write("} // end of %s", scope_name); |
| } |
| |
| private: |
| // The trace contents |
| std::vector<ThreadTraceEntry> entries_; |
| |
| // Current indentation level. |
| int indent_ = 0; |
| // Thread's ID as set by Ruy, e.g. [0,N-1]. Not OS TID. |
| int thread_id_ = -1; |
| // The run-ahead loop in `EnsurePacked` may run many iterations when the |
| // thread is waiting for a block to be packed by another thread --- it's |
| // a busy wait. We track whether we are in that mode to avoid generating |
| // many uninteresting trace entries. |
| bool is_in_run_ahead_packing_loop_ = false; |
| // Last recorded value of __FILE__ and __LINE__, as a convenience so we don't |
| // have to pass these in every call to `Write`. |
| const char* current_source_file_ = nullptr; |
| int current_source_line_ = 0; |
| }; |
| |
| // Main components of ruy. Used for trace colorization. |
| enum class Component { kNone, kFrontEnd, kMiddleEnd, kBackEnd, kThreadPool }; |
| |
| // Output format for the trace. |
| enum class TraceOutputFormat { kNone, kTerminal, kHtml }; |
| |
| inline std::string IndentString(int indent) { |
| std::string s; |
| for (int i = 0; i < indent; i++) { |
| s += " "; |
| } |
| return s; |
| } |
| |
| // Returns the text to write to the trace to open a colored section. |
| inline const char* ColorSectionStart(TraceOutputFormat output_format, |
| Component component) { |
| switch (output_format) { |
| case TraceOutputFormat::kTerminal: |
| switch (component) { |
| case Component::kFrontEnd: |
| return "\x1b[36m"; |
| case Component::kMiddleEnd: |
| return "\x1b[32m"; |
| case Component::kBackEnd: |
| return "\x1b[31m"; |
| case Component::kThreadPool: |
| return "\x1b[33m"; |
| default: |
| abort(); |
| return nullptr; |
| } |
| case TraceOutputFormat::kHtml: |
| switch (component) { |
| case Component::kFrontEnd: |
| return "<span style=\"background-color:#B2EBF2\">"; |
| case Component::kMiddleEnd: |
| return "<span style=\"background-color:#C8E6C9\">"; |
| case Component::kBackEnd: |
| return "<span style=\"background-color:#FFCDD2\">"; |
| case Component::kThreadPool: |
| return "<span style=\"background-color:#FFF9C4\">"; |
| default: |
| abort(); |
| return nullptr; |
| } |
| default: |
| abort(); |
| return nullptr; |
| } |
| } |
| |
| // Returns the text to write to the trace to close a colored section. |
| inline const char* ColorSectionEnd(TraceOutputFormat output_format) { |
| switch (output_format) { |
| case TraceOutputFormat::kTerminal: |
| return "\x1b[0m"; |
| case TraceOutputFormat::kHtml: |
| return "</span>"; |
| default: |
| abort(); |
| return nullptr; |
| } |
| } |
| |
| // Returns the output format to use for the trace. |
| inline TraceOutputFormat GetOutputFormat() { |
| const char* html_env = getenv("RUY_TRACE_HTML"); |
| if (html_env && strtol(html_env, nullptr, 10) != 0) { |
| return TraceOutputFormat::kHtml; |
| } else { |
| return TraceOutputFormat::kTerminal; |
| } |
| } |
| |
| // A `basename` function that's good enough for ruy __FILE__'s. |
| // Note: `basename` is POSIX-only and annoying (takes a char*, may mutate). |
| inline const char* GetBaseName(const char* path) { |
| std::size_t len = strlen(path); |
| if (len == 0) { |
| return path; |
| } |
| const char* ptr = path + len - 1; |
| while (ptr != path) { |
| if (*ptr == '/' || *ptr == '\\') { |
| return ptr + 1; |
| } |
| --ptr; |
| } |
| // Path did not contain any path separator. |
| return path; |
| } |
| |
| // Determines a Component (used for colorization) by source file. |
| inline Component GetComponentBySourceFile(const char* base_name) { |
| if (!strcmp(base_name, "pack.h") || !strcmp(base_name, "kernel.h")) { |
| return Component::kBackEnd; |
| } else if (!strcmp(base_name, "trmul.cc") || |
| !strcmp(base_name, "block_map.cc")) { |
| return Component::kMiddleEnd; |
| } else if (!strcmp(base_name, "thread_pool.cc")) { |
| return Component::kThreadPool; |
| } else { |
| return Component::kFrontEnd; |
| } |
| } |
| |
| inline std::string EscapeText(TraceOutputFormat output_format, |
| const std::string& text) { |
| if (output_format == TraceOutputFormat::kHtml) { |
| std::string escaped_text; |
| for (char c : text) { |
| if (c == '<') { |
| escaped_text += "<"; |
| } else if (c == '>') { |
| escaped_text += ">"; |
| } else { |
| escaped_text += c; |
| } |
| } |
| return escaped_text; |
| } else { |
| return text; |
| } |
| } |
| |
| // Prints an entry from the trace to the destination trace file. |
| inline void Print(const ThreadTraceEntry& entry, |
| TraceOutputFormat output_format, FILE* file) { |
| const char* base_name = GetBaseName(entry.source_file); |
| Component component = GetComponentBySourceFile(base_name); |
| const std::string& source_location = |
| formatted("%s:%d", base_name, entry.source_line); |
| const std::string& escaped_text = EscapeText(output_format, entry.text); |
| fprintf(file, "%s%-32s%s%s%s\n", ColorSectionStart(output_format, component), |
| source_location.c_str(), IndentString(entry.indent).c_str(), |
| escaped_text.c_str(), ColorSectionEnd(output_format)); |
| } |
| |
| // Prints a thread's entire trace to the destination trace file. |
| inline void Print(const ThreadTrace& trace, TraceOutputFormat output_format, |
| FILE* file) { |
| if (output_format == TraceOutputFormat::kHtml) { |
| fprintf(file, "<html><body><pre>\n<span style=\"font-weight:bold\">\n"); |
| } |
| fprintf(file, "Ruy trace for thread %d:\n", trace.thread_id()); |
| if (output_format == TraceOutputFormat::kHtml) { |
| fprintf(file, "</span>\n"); |
| } |
| for (const ThreadTraceEntry& entry : trace.entries()) { |
| Print(entry, output_format, file); |
| } |
| fprintf(file, "\n"); |
| if (output_format == TraceOutputFormat::kHtml) { |
| fprintf(file, "</pre></body></html>\n"); |
| } |
| } |
| |
| // Holds all the threads' traces. This is a global singleton class. |
| // On exit, when the singleton is destroyed, the destructor prints out the |
| // traces. |
| class AllThreadTraces final { |
| public: |
| // Add a new ThreadTrace for the current thread. Should be called only once |
| // on each thread. |
| ThreadTrace* AddCurrentThread() { |
| std::lock_guard<std::mutex> lock(mutex_); |
| ThreadTrace* thread_trace = new ThreadTrace; |
| thread_traces_.emplace_back(thread_trace); |
| return thread_trace; |
| } |
| ~AllThreadTraces() { |
| std::lock_guard<std::mutex> lock(mutex_); |
| // Open the destination file. |
| const char* file_env = getenv("RUY_TRACE_FILE"); |
| FILE* file = stdout; |
| if (file_env) { |
| file = fopen(file_env, "w"); |
| if (!file) { |
| fprintf(stderr, "Failed to open %s for write\n", file_env); |
| exit(1); |
| } |
| } |
| // Sort the threads by Ruy Thread ID (not OS TID). |
| auto output_format = GetOutputFormat(); |
| std::sort(std::begin(thread_traces_), std::end(thread_traces_), |
| [](const auto& a, const auto& b) { |
| return a->thread_id() < b->thread_id(); |
| }); |
| // Print all the threads' traces. |
| for (const auto& trace : thread_traces_) { |
| Print(*trace, output_format, file); |
| } |
| if (file_env) { |
| fclose(file); |
| } |
| } |
| static AllThreadTraces* Singleton() { |
| static AllThreadTraces all_thread_traces; |
| return &all_thread_traces; |
| } |
| |
| private: |
| std::vector<std::unique_ptr<ThreadTrace>> thread_traces_; |
| std::mutex mutex_; |
| }; |
| |
| // Returns the thread-local ThreadTrace singleton, constructing it as needed. |
| inline ThreadTrace* ThreadLocalTrace() { |
| static thread_local ThreadTrace* thread_local_trace = |
| AllThreadTraces::Singleton()->AddCurrentThread(); |
| return thread_local_trace; |
| } |
| |
| // RAII helper to trace a scope, e.g. a function scope. |
| class RuyTraceScope { |
| const char* source_file_; |
| int source_line_; |
| const char* scope_name_; |
| |
| public: |
| RuyTraceScope(const char* source_file, int source_line, |
| const char* scope_name) |
| : source_file_(source_file), |
| source_line_(source_line), |
| scope_name_(scope_name) { |
| ThreadLocalTrace()->set_current_source_file(source_file_); |
| ThreadLocalTrace()->set_current_source_line(source_line_); |
| ThreadLocalTrace()->EnterScope(scope_name_); |
| } |
| ~RuyTraceScope() { |
| ThreadLocalTrace()->set_current_source_file(source_file_); |
| ThreadLocalTrace()->set_current_source_line(source_line_); |
| ThreadLocalTrace()->LeaveScope(scope_name_); |
| } |
| }; |
| |
| #define RUY_TRACE_SCOPE_NAME_IMPL(file, line, name) \ |
| RuyTraceScope ruy_trace_scope##line(file, line, name) |
| #define RUY_TRACE_SCOPE_NAME(name) \ |
| RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, name) |
| #define RUY_TRACE_SCOPE \ |
| RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, __FUNCTION__) |
| |
| // Helpers to trace Ruy objects. |
| |
| inline std::string str(Order o) { |
| return o == Order::kRowMajor ? "row-major" : "column-major"; |
| } |
| |
| inline std::string str(Side s) { return s == Side::kLhs ? "LHS" : "RHS"; } |
| |
| inline std::string str(const Layout& layout) { |
| std::string s = |
| formatted("%dx%d, %s", layout.rows(), layout.cols(), str(layout.order())); |
| int inner_size = |
| layout.order() == Order::kRowMajor ? layout.cols() : layout.rows(); |
| if (inner_size != layout.stride()) { |
| s += formatted(", stride=%d", layout.stride()); |
| } else { |
| s += formatted(", unstrided"); |
| } |
| return s; |
| } |
| |
| inline std::string str(const MatLayout& layout) { |
| std::string s = |
| formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order)); |
| int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows; |
| if (inner_size != layout.stride) { |
| s += formatted(", stride=%d", layout.stride); |
| } else { |
| s += formatted(", unstrided"); |
| } |
| return s; |
| } |
| |
| inline std::string str(const PMatLayout& layout) { |
| std::string s = |
| formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order)); |
| int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows; |
| if (inner_size != layout.stride) { |
| s += formatted(", stride=%d", layout.stride); |
| } else { |
| s += formatted(", unstrided"); |
| } |
| s += formatted(", kernel blocks: %dx%d %s", layout.kernel.rows, |
| layout.kernel.cols, str(layout.kernel.order)); |
| return s; |
| } |
| |
| template <typename T> |
| std::string str() { |
| return "<unknown type>"; |
| } |
| #define RUY_IMPL_STR_TYPE_STD(T) \ |
| template <> \ |
| inline std::string str<std::T>() { \ |
| return #T; \ |
| } |
| #define RUY_IMPL_STR_TYPE(T) \ |
| template <> \ |
| inline std::string str<T>() { \ |
| return #T; \ |
| } |
| |
| RUY_IMPL_STR_TYPE(float) |
| RUY_IMPL_STR_TYPE(double) |
| RUY_IMPL_STR_TYPE_STD(int8_t) |
| RUY_IMPL_STR_TYPE_STD(uint8_t) |
| RUY_IMPL_STR_TYPE_STD(int16_t) |
| RUY_IMPL_STR_TYPE_STD(uint16_t) |
| RUY_IMPL_STR_TYPE_STD(int32_t) |
| RUY_IMPL_STR_TYPE_STD(uint32_t) |
| RUY_IMPL_STR_TYPE_STD(int64_t) |
| RUY_IMPL_STR_TYPE_STD(uint64_t) |
| |
| template <typename T> |
| std::string str(const Matrix<T>& matrix) { |
| std::string s = formatted("Matrix<%s>, %s", str<T>(), str(matrix.layout())); |
| if (matrix.zero_point()) { |
| s += formatted(", zero_point=%d", static_cast<int>(matrix.zero_point())); |
| } |
| if (matrix.cache_policy() != CachePolicy::kNeverCache) { |
| s += |
| formatted(", cache_policy=%d", static_cast<int>(matrix.cache_policy())); |
| } |
| return s; |
| } |
| |
| inline std::string str(const Type& type) { |
| char c; |
| if (type.is_floating_point) { |
| c = 'f'; |
| } else if (type.is_signed) { |
| c = 'i'; |
| } else { |
| c = 'u'; |
| } |
| return formatted("%c%d", c, type.size * 8); |
| } |
| |
| inline std::string str(const EMat& mat) { |
| std::string s = |
| formatted("EMat, data_type=%s, %s", str(mat.data_type), str(mat.layout)); |
| if (mat.zero_point) { |
| s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point)); |
| } |
| if (mat.cache_policy != CachePolicy::kNeverCache) { |
| s += formatted(", cache_policy=%d", static_cast<int>(mat.cache_policy)); |
| } |
| return s; |
| } |
| |
| inline std::string str(const PEMat& mat) { |
| std::string s = |
| formatted("PEMat, data_type=%s, %s", str(mat.data_type), str(mat.layout)); |
| if (mat.zero_point) { |
| s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point)); |
| } |
| return s; |
| } |
| |
| inline std::string str(Path paths) { |
| bool first = true; |
| std::string s; |
| for (int bit = 0; bit < 16; bit++) { |
| Path cur_path = static_cast<Path>(1 << bit); |
| if ((paths & cur_path) != Path::kNone) { |
| if (!first) { |
| s += " | "; |
| } |
| first = false; |
| switch (cur_path) { |
| case Path::kNone: |
| continue; |
| #define RUY_HANDLE_PATH(p) \ |
| case Path::p: \ |
| s += #p; \ |
| break; |
| RUY_HANDLE_PATH(kStandardCpp) |
| RUY_HANDLE_PATH(kInternalStandardCppVariant1) |
| RUY_HANDLE_PATH(kInternalStandardCppVariant2) |
| RUY_HANDLE_PATH(kInternalStandardCppVariant3) |
| #if RUY_PLATFORM_ARM |
| RUY_HANDLE_PATH(kNeon) |
| RUY_HANDLE_PATH(kNeonDotprod) |
| #endif // RUY_PLATFORM_ARM |
| #if RUY_PLATFORM_X86 |
| RUY_HANDLE_PATH(kAvx) |
| RUY_HANDLE_PATH(kAvx2Fma) |
| RUY_HANDLE_PATH(kAvx512) |
| #endif // RUY_PLATFORM_X86 |
| #undef RUY_HANDLE_PATH |
| default: |
| fprintf(stderr, "Unhandled Path value 0x%x\n", |
| static_cast<int>(cur_path)); |
| abort(); |
| } |
| } |
| } |
| return s; |
| } |
| |
| // Implementation of RUY_TRACE_INFO(X) macros. |
| |
| #define RUY_TRACE_INFO_MUL \ |
| ThreadLocalTrace()->Write("CompiledPaths: %s", str(CompiledPaths)); \ |
| ThreadLocalTrace()->Write("LHS: %s", str(lhs)); \ |
| ThreadLocalTrace()->Write("RHS: %s", str(rhs)); \ |
| ThreadLocalTrace()->Write("Destination: %s", str(*dst)); |
| |
| #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_TRANSPOSING \ |
| ThreadLocalTrace()->Write("Canonicalizing to column-major destination:"); \ |
| ThreadLocalTrace()->Write( \ |
| "Swapping LHS<->RHS and flipping all storage orders."); |
| |
| #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST \ |
| ThreadLocalTrace()->Write("Runtime-selected path: %s", str(the_path)); \ |
| ThreadLocalTrace()->Write("LHS: %s", str(params->src[Side::kLhs])); \ |
| ThreadLocalTrace()->Write("RHS: %s", str(params->src[Side::kRhs])); \ |
| ThreadLocalTrace()->Write("Destination: %s", str(params->dst)); |
| |
| #define RUY_TRACE_INFO_POPULATE_TRMUL_PARAMS \ |
| ThreadLocalTrace()->Write( \ |
| "Here we have this Path as a template parameter: %s", str(ThePath)); \ |
| ThreadLocalTrace()->Write("PackedLhsScalar: %s", str<PackedLhsScalar>()); \ |
| ThreadLocalTrace()->Write("PackedRhsScalar: %s", str<PackedRhsScalar>()); \ |
| ThreadLocalTrace()->Write("Kernel function pointer: %p", \ |
| params->run_kernel); \ |
| ThreadLocalTrace()->Write("Kernel LHS block layout: %dx%d %s", \ |
| LhsKernelLayout::kRows, LhsKernelLayout::kCols, \ |
| str(LhsKernelLayout::kOrder)); \ |
| ThreadLocalTrace()->Write("Kernel RHS block layout: %dx%d %s", \ |
| RhsKernelLayout::kRows, RhsKernelLayout::kCols, \ |
| str(RhsKernelLayout::kOrder)); \ |
| ThreadLocalTrace()->Write("Created packed matrices:"); \ |
| ThreadLocalTrace()->Write("Packed LHS matrix: %s", \ |
| str(params->packed_matrix[Side::kLhs])); \ |
| ThreadLocalTrace()->Write("Packed RHS matrix: %s", \ |
| str(params->packed_matrix[Side::kRhs])); \ |
| ThreadLocalTrace()->Write("LHS packing function pointer: %p", \ |
| params->run_pack[Side::kLhs]); \ |
| ThreadLocalTrace()->Write("RHS packing function pointer: %p", \ |
| params->run_pack[Side::kRhs]); |
| |
| #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE \ |
| ThreadLocalTrace()->Write("SetRuntimeEnabledPaths forcing paths: %s", \ |
| str(*paths)); |
| |
| #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR \ |
| ThreadLocalTrace()->Write("Environment variable forcing paths: %s", \ |
| str(*paths)); |
| |
| #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_DETECTION \ |
| ThreadLocalTrace()->Write( \ |
| "Runtime-detected paths: %s", \ |
| str(*paths & ~kNonArchPathsIncludingInternalVariants)); |
| |
| #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_SHOULD_CACHE \ |
| ThreadLocalTrace()->Write( \ |
| "Caching the packed %s matrix. Already in cache: %s", str(side), \ |
| action == PrepackedCache::Action::kInsertedNewEntry ? "no" : "yes"); |
| |
| #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_NO_CACHE \ |
| ThreadLocalTrace()->Write("Not caching the packed %s matrix.", str(side)); |
| |
| #define RUY_TRACE_INFO_GET_TENTATIVE_THREAD_COUNT \ |
| ThreadLocalTrace()->Write( \ |
| "tentative_thread_count=%d (determined based on shape %dx%dx%d and " \ |
| "max_num_threads=%d)", \ |
| tentative_thread_count, rows, depth, cols, ctx->max_num_threads()); |
| |
| #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_TRUE \ |
| ThreadLocalTrace()->Write( \ |
| "Choosing to use the simple loop code path in TrMul because of the " \ |
| "linear traversal and single thread."); |
| |
| #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_FALSE \ |
| ThreadLocalTrace()->Write( \ |
| "Choosing to use the general case code path in TrMul because of: %s", \ |
| tentative_thread_count > 1 ? "multi-threading" \ |
| : "non-linear traversal order"); |
| |
| #define RUY_TRACE_INFO_TRMUL_SIMPLE_LOOP \ |
| ThreadLocalTrace()->Write("Entering the simple loop code path of TrMul"); |
| |
| #define RUY_TRACE_INFO_TRMUL_GENERAL_CASE \ |
| ThreadLocalTrace()->Write("Entering the general case code path of TrMul"); |
| |
| #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_START \ |
| ThreadLocalTrace()->Write("Kernel block: %dx%d", kernel_rows, kernel_cols); \ |
| ThreadLocalTrace()->Write( \ |
| "BlockMap shape: %dx%d (destination matrix shape rounded to next " \ |
| "kernel blocks)", \ |
| rows, cols); \ |
| ThreadLocalTrace()->Write( \ |
| "Rectangularness log2: %dx%d (powers of two factors bringing the shape " \ |
| "closest to square)", \ |
| rows_rectangularness_log2, cols_rectangularness_log2); \ |
| ThreadLocalTrace()->Write("Accumulation depth: %d", depth); \ |
| ThreadLocalTrace()->Write("LHS scalar type size: %d", lhs_scalar_size); \ |
| ThreadLocalTrace()->Write("RHS scalar type size: %d", rhs_scalar_size); \ |
| ThreadLocalTrace()->Write("Tentative thread count: %d", \ |
| tentative_thread_count); \ |
| ThreadLocalTrace()->Write( \ |
| "CPU cache params: local_cache_size=%d, last_level_cache_size=%d", \ |
| cpu_cache_params.local_cache_size, \ |
| cpu_cache_params.last_level_cache_size); \ |
| ThreadLocalTrace()->Write( \ |
| "For the sizes below, when rows!=cols, we always retain the min of the " \ |
| "two."); \ |
| ThreadLocalTrace()->Write("Kernel block size_log2: %d", kernel_size_log2); \ |
| ThreadLocalTrace()->Write( \ |
| "BlockMap size_log2: %d (destination matrix shape rounded to next " \ |
| "kernel blocks)", \ |
| size_log2); \ |
| ThreadLocalTrace()->Write( \ |
| "Now we will pick the optimal log2 of BlockMap block size"); |
| |
| #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE \ |
| ThreadLocalTrace()->Write( \ |
| "For BlockMap block size_log2 %d, score=%d (" \ |
| "multithreading_score=%d + cache_locality_score=%d + " \ |
| "kernel_amortization_score=%d)", \ |
| block_size_log2, score, multithreading_score, cache_locality_score, \ |
| kernel_amortization_score); |
| |
| #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_END \ |
| ThreadLocalTrace()->Write("Selecting BlockMap block size_log2: %d", \ |
| best_score_block_size_log2); \ |
| ThreadLocalTrace()->Write( \ |
| "BlockMap has %dx%d blocks, each of size between %dx%d and %dx%d.", \ |
| 1 << num_blocks_of_rows_log2, 1 << num_blocks_of_cols_log2, \ |
| block_map->small_block_dims[Side::kLhs], \ |
| block_map->small_block_dims[Side::kRhs], \ |
| block_map->small_block_dims[Side::kLhs] + \ |
| block_map->kernel_dims[Side::kLhs], \ |
| block_map->small_block_dims[Side::kRhs] + \ |
| block_map->kernel_dims[Side::kRhs]); \ |
| ThreadLocalTrace()->Write( \ |
| "The first %d rows of blocks have %d rows, the remaining ones have %d " \ |
| "rows ", \ |
| block_map->large_blocks[Side::kLhs], \ |
| block_map->small_block_dims[Side::kLhs] + \ |
| block_map->kernel_dims[Side::kLhs], \ |
| block_map->small_block_dims[Side::kLhs]); \ |
| ThreadLocalTrace()->Write( \ |
| "The first %d columns of blocks have %d columns, the remaining ones " \ |
| "have %d columns ", \ |
| block_map->large_blocks[Side::kRhs], \ |
| block_map->small_block_dims[Side::kRhs] + \ |
| block_map->kernel_dims[Side::kLhs], \ |
| block_map->small_block_dims[Side::kRhs]); \ |
| ThreadLocalTrace()->Write( \ |
| "Traversal order: %s", \ |
| block_map->traversal_order == BlockMapTraversalOrder::kLinear ? "linear" \ |
| : block_map->traversal_order == BlockMapTraversalOrder::kFractalZ \ |
| ? "fractal Z-curve" \ |
| : block_map->traversal_order == BlockMapTraversalOrder::kFractalU \ |
| ? "fractal U-curve" \ |
| : block_map->traversal_order == BlockMapTraversalOrder::kFractalHilbert \ |
| ? "fractal Hilbert curve" \ |
| : nullptr); \ |
| ThreadLocalTrace()->Write("Finalized thread count: %d", \ |
| block_map->thread_count); |
| |
| #define RUY_TRACE_SET_THEAD_ID(thread_id) \ |
| ThreadLocalTrace()->set_thread_id(thread_id); |
| |
| #define RUY_TRACE_INFO_TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS \ |
| ThreadLocalTrace()->Write( \ |
| "Block #%d is at position (%d, %d) in the BlockMap.", block_id, \ |
| block[Side::kLhs], block[Side::kRhs]); \ |
| ThreadLocalTrace()->Write( \ |
| "Block #%d has shape %dx%d and starts at position (%d, %d) in the " \ |
| "destination matrix.", \ |
| block_id, end[Side::kLhs] - start[Side::kLhs], \ |
| end[Side::kRhs] - start[Side::kRhs], start[Side::kLhs], \ |
| start[Side::kRhs]); \ |
| ThreadLocalTrace()->Write( \ |
| "Block #%d depends on LHS panel #%d and RHS panel #%d.", block_id, \ |
| block[Side::kLhs], block[Side::kRhs]); |
| |
| #define RUY_TRACE_INFO_TRYPACK_PACKING \ |
| ThreadLocalTrace()->Write( \ |
| "%s panel #%d is not already packed. Packing it now.", str(side), \ |
| block); |
| |
| #define RUY_TRACE_INFO_TRYPACK_ANOTHER_THREAD_PACKING \ |
| if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
| ThreadLocalTrace()->Write( \ |
| "%s panel #%d is currently being packed by another thread.", \ |
| str(side), block); \ |
| } |
| |
| #define RUY_TRACE_INFO_TRYPACK_PREVIOUSLY_PACKED \ |
| if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
| ThreadLocalTrace()->Write("%s panel #%d had previously been packed.", \ |
| str(side), block); \ |
| } |
| |
| #define RUY_TRACE_INFO_TRYPACK_PACKED_BY_ANOTHER_THREAD \ |
| ThreadLocalTrace()->Write( \ |
| "%s panel #%d has just been packed by another thread.", str(side), \ |
| block); |
| |
| #define RUY_TRACE_INFO_ENSURE_PACKED_ENTER_RUN_AHEAD \ |
| if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
| ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(true); \ |
| ThreadLocalTrace()->Write( \ |
| "We're blocked on other threads packing the panels that we need. " \ |
| "Packing some other panels while we wait..."); \ |
| } |
| |
| #define RUY_TRACE_INFO_ENSURE_PACKED_END \ |
| if (ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
| ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(false); \ |
| ThreadLocalTrace()->Write( \ |
| "Other threads have finished packing what we were waiting for."); \ |
| } |
| |
| #define RUY_TRACE_INFO_RUN_PACK \ |
| ThreadLocalTrace()->Write("Path: %s", str(ThePath)); \ |
| ThreadLocalTrace()->Write("Packing panel consisting of columns [%d, %d)", \ |
| start_col, end_col); \ |
| ThreadLocalTrace()->Write("Source: columns [%d, %d) of %s", start_col, \ |
| end_col, str(src_matrix)); \ |
| ThreadLocalTrace()->Write("Destination: columns [%d, %d) of %s", start_col, \ |
| end_col, str(*packed_matrix)); \ |
| if (end_col > src_matrix.layout.cols) { \ |
| ThreadLocalTrace()->Write( \ |
| "This runs past the last column of the source matrix. Padding as " \ |
| "needed."); \ |
| } \ |
| if (packed_matrix->layout.rows > src_matrix.layout.rows) { \ |
| ThreadLocalTrace()->Write( \ |
| "The packed matrix has more rows than the source matrix due to " \ |
| "rounding up to the kernel block size. Padding as needed."); \ |
| } |
| |
| #define RUY_TRACE_INFO_RUN_KERNEL \ |
| { \ |
| ThreadLocalTrace()->Write("Path: %s", str(KernelArgs<KernelType>::kPath)); \ |
| int lhs_cols = end[Side::kLhs] - start[Side::kLhs]; \ |
| int rhs_cols = end[Side::kRhs] - start[Side::kRhs]; \ |
| int kernel_lhs_cols = src[Side::kLhs].layout.kernel.cols; \ |
| int kernel_rhs_cols = src[Side::kRhs].layout.kernel.cols; \ |
| ThreadLocalTrace()->Write("LHS: columns [%d, %d) of %s", \ |
| start[Side::kLhs], end[Side::kLhs], \ |
| str(src[Side::kLhs])); \ |
| ThreadLocalTrace()->Write("RHS: columns [%d, %d) of %s", \ |
| start[Side::kRhs], end[Side::kRhs], \ |
| str(src[Side::kRhs])); \ |
| ThreadLocalTrace()->Write("Destination: block [%d, %d)x[%d, %d) of %s", \ |
| start[Side::kLhs], end[Side::kLhs], \ |
| start[Side::kRhs], end[Side::kRhs], str(*dst)); \ |
| if (end[Side::kLhs] > dst->layout.rows || \ |
| end[Side::kRhs] > dst->layout.cols) { \ |
| ThreadLocalTrace()->Write( \ |
| "This runs over the destination matrix boundaries. The kernel will " \ |
| "internally clamp stores to avoid overruns."); \ |
| } \ |
| ThreadLocalTrace()->Write( \ |
| "The kernel's inner loop only produces a %dx%d block, so the " \ |
| "kernel's outer loops will run %dx%d times.", \ |
| kernel_lhs_cols, kernel_rhs_cols, lhs_cols / kernel_lhs_cols, \ |
| rhs_cols / kernel_rhs_cols); \ |
| } |
| |
| #define RUY_TRACE_INFO_THREAD_FUNC_IMPL_WAITING \ |
| ThreadLocalTrace()->Write("Waiting for a task..."); |
| |
| #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK \ |
| ThreadLocalTrace()->Write("Sending task #%d to a worker thread...", i); |
| |
| #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD \ |
| ThreadLocalTrace()->Write("Running task #0 on the current thread..."); |
| |
| #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_WAITING_FOR_THREADS \ |
| ThreadLocalTrace()->Write("Waiting for worker threads to finish.."); |
| |
| #define RUY_TRACE_INFO(id) \ |
| [=]() { \ |
| ThreadLocalTrace()->set_current_source_file(__FILE__); \ |
| ThreadLocalTrace()->set_current_source_line(__LINE__); \ |
| RUY_TRACE_INFO_##id \ |
| }() |
| |
| } // namespace ruy |
| |
| #else |
| |
| // Vacuous implementation when RUY_TRACE is not defined. |
| #define RUY_TRACE_SCOPE_NAME(name) |
| #define RUY_TRACE_SCOPE |
| #define RUY_TRACE_SET_THEAD_ID(thread_id) |
| #define RUY_TRACE_INFO(id) |
| |
| #endif |
| |
| #endif // RUY_RUY_TRACE_H_ |