So I made some rough benchmarks to see what would happen and it seems like I’ve been able to answer my own question.
TLDR: Doing fewer math ops is better than doing more local ops. It seems that reducing the number of function calls should be the priority.
I think this result would suggest that Sum3 is going to be faster.
I tested
Duplicate
a + b + c;
a + b + d;
against No Duplicate.
r = a + b;
r + c;
r + d;
#include <span>
#include <array>
#include <vector>
#include <cstdint>
#include <iostream>
// Number of float samples in one signal block.
static constexpr size_t N{1024};

// Fixed-extent view over exactly N contiguous floats.
using signal_span = std::span<float, N>;

// Array type with room for 10'000 floats.
using signal_store = std::array<float, 10'000>;

// Pointer to a block operation; the three spans are, in order,
// the left-hand input, the right-hand input and the output.
using Fn = void (*)(signal_span, signal_span, signal_span);
// Element-wise sum: stores lhs[k] + rhs[k] into output[k] for every k in [0, N),
// walking the blocks front to back.
void add(signal_span lhs, signal_span rhs, signal_span output) {
    size_t k = 0;
    while (k < N) {
        const float sum = lhs[k] + rhs[k];
        output[k] = sum;
        ++k;
    }
}
// Element-wise product: stores lhs[k] * rhs[k] into output[k] for every k in [0, N),
// walking the blocks front to back.
void mul(signal_span lhs, signal_span rhs, signal_span output) {
    size_t k = 0;
    while (k < N) {
        const float product = lhs[k] * rhs[k];
        output[k] = product;
        ++k;
    }
}
// One step of a benchmark program: invoke `func` on three N-float blocks
// located at the given element offsets from the start of the signal buffer.
//
// The offsets are 32-bit because the benchmark buffers hold 1'000'000 floats;
// a 16-bit offset tops out at 32'767, so larger offsets would silently wrap
// to a negative value and index outside the buffer.
struct Instruction {
    Fn func;         // block operation to run (add, mul, ...)
    int32_t lhs;     // element offset of the left-hand input block
    int32_t rhs;     // element offset of the right-hand input block
    int32_t output;  // element offset of the block that receives the result
};
// Benchmark "Duplicate": evaluates a + b + c and a + b + d, recomputing the
// shared sub-expression a + b for each result (four add calls per iteration,
// all operating on neighbouring blocks).
static void duplicate(benchmark::State& state) {
    // Heap-allocated and fully initialised. A 4 MB local array can overflow the
    // stack, and reading its indeterminate contents is undefined behaviour.
    std::vector<float> buffer(1'000'000, 1.0f);

    // Element offsets of the N-float blocks. Every offset is a multiple of N,
    // so no two blocks overlap and no instruction aliases its own inputs.
    constexpr size_t off_a   = 0 * N;
    constexpr size_t off_b   = 1 * N;
    constexpr size_t off_c   = 2 * N;
    constexpr size_t off_d   = 3 * N;
    constexpr size_t off_ab1 = 4 * N;  // a + b, first evaluation
    constexpr size_t off_abc = 5 * N;  // (a + b) + c
    constexpr size_t off_ab2 = 6 * N;  // a + b, evaluated again
    constexpr size_t off_abd = 7 * N;  // (a + b) + d

    // Brace-initialisation with constant offsets: an offset that does not fit
    // the Instruction offset fields is a compile error, not a silent wrap.
    std::vector<Instruction> instr{
        {add, off_a,   off_b, off_ab1},  // t1  = a + b
        {add, off_ab1, off_c, off_abc},  // out = t1 + c
        {add, off_a,   off_b, off_ab2},  // t2  = a + b (duplicate work)
        {add, off_ab2, off_d, off_abd},  // out = t2 + d
    };

    for (auto _ : state) {
        for (const Instruction& ins : instr) {
            ins.func(
                signal_span(buffer.data() + ins.lhs, N),
                signal_span(buffer.data() + ins.rhs, N),
                signal_span(buffer.data() + ins.output, N));
        }
        benchmark::DoNotOptimize(instr);
    }

    // Fold the whole buffer into one printed value so the results are observed.
    float ff = 0.0f;
    for (const float f : buffer)
        ff += f;
    std::cout << ff << '\n';
}
BENCHMARK(duplicate);
// Benchmark "No Duplicate": evaluates r = a + b once and reuses it for r + c
// and r + d (three add calls per iteration). The d operand and its result live
// far away from r to model a large jump between input and output.
static void no_duplicate(benchmark::State& state) {
    // Heap-allocated and fully initialised. A 4 MB local array can overflow the
    // stack, and reading its indeterminate contents is undefined behaviour.
    std::vector<float> buffer(1'000'000, 1.0f);

    // Element offsets of the N-float blocks. Every offset is a multiple of N,
    // so no output block overlaps r or any other operand.
    constexpr size_t off_a  = 0 * N;
    constexpr size_t off_b  = 1 * N;
    constexpr size_t off_c  = 2 * N;
    constexpr size_t off_r  = 3 * N;   // r = a + b, shared by both results
    constexpr size_t off_rc = 4 * N;   // r + c
    // Far-away blocks: a big jump from r, yet still below 32'767 so the
    // offsets fit even a 16-bit signed offset field.
    constexpr size_t off_d  = 28 * N;
    constexpr size_t off_rd = 29 * N;  // r + d

    // Brace-initialisation with constant offsets: an offset that does not fit
    // the Instruction offset fields is a compile error, not a silent wrap.
    std::vector<Instruction> instr{
        {add, off_a, off_b, off_r},   // r   = a + b
        {add, off_r, off_c, off_rc},  // out = r + c
        {add, off_r, off_d, off_rd},  // out = r + d (distant operand and output)
    };

    for (auto _ : state) {
        for (const Instruction& ins : instr) {
            ins.func(
                signal_span(buffer.data() + ins.lhs, N),
                signal_span(buffer.data() + ins.rhs, N),
                signal_span(buffer.data() + ins.output, N));
        }
        benchmark::DoNotOptimize(instr);
    }

    // Fold the whole buffer into one printed value so the results are observed.
    float ff = 0.0f;
    for (const float f : buffer)
        ff += f;
    std::cout << ff << '\n';
}
BENCHMARK(no_duplicate);
Some things I learnt.
If the input and output pointers alias, things are much slower; I guess the compiler adds a check for this.
Block size has no measurable effect.