Issue
The following scenarios are frequently used in asynchronous programming.
- channel tx/rx;
- mutex lock/unlock;
- async task spawn;
So I ran some comparison tests on a low-performance cloud host (roughly equivalent to an Intel J1900). I found that Rust/Tokio performed far worse than Go.
Is there any parameter that needs to be adjusted? Can a single thread executor improve it?
Results.
tx/rx, time per op: go-lang: 112 ns,
tokio::sync::mpsc::channel: 7387 ns;
std::sync::channel: 2705 ns,
crossbeam: 1062 ns.
mutex lock/unlock, per op:
tokio::sync::Mutex 4051 ns
std::sync::Mutex 321 ns
spawn (not join), per op:
tokio::spawn: 8445 ns
Rust tokio test tx/rx on channel
/// Benchmarks tokio mpsc channel send/recv round-trip throughput.
///
/// A consumer task drains the channel until it sees the final value,
/// then total wall time and per-operation nanoseconds are printed.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn test_chan_benchmark() {
    let count = 100_000;
    let (tx, mut rx) = tokio::sync::mpsc::channel(10000);
    // Instant is a monotonic clock; SystemTime can jump backwards and
    // make duration_since() return Err, which would abort the benchmark.
    let start = std::time::Instant::now();
    let handle = tokio::spawn(async move {
        // recv() yields None once every sender is dropped, so `while let`
        // terminates cleanly instead of panicking on unwrap().
        while let Some(i) = rx.recv().await {
            if i == count - 1 {
                break;
            }
        }
    });
    for i in 0..count {
        tx.send(i).await.unwrap();
    }
    drop(tx);
    handle.await.unwrap();
    let dur = start.elapsed();
    println!(
        "count={count}, consumed={}ms, ops={}ns",
        dur.as_millis(),
        dur.as_nanos() / count as u128,
    );
}
Go channel tx/rx:
// TestChanPerformance measures the per-operation cost of sending and
// receiving an int over a buffered Go channel.
func TestChanPerformance(t *testing.T) {
	const count = 1000000
	ch := make(chan int, count)
	done := make(chan int, 1)
	start := time.Now()
	go func() {
		// Drain the channel until it is closed, then signal completion.
		for range ch {
		}
		done <- 0
	}()
	for i := 0; i < count; i++ {
		ch <- i
	}
	close(ch)
	<-done
	elapsed := time.Since(start)
	t.Logf("txrx %d times consumed %d ms, %d nspo", count, elapsed.Milliseconds(), elapsed.Nanoseconds()/int64(count))
}
Mutex test:
/// Benchmarks the uncontended lock/unlock cost of `std::sync::Mutex`
/// for several iteration counts.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn bench_std_mutex() {
    for count in [1_000, 10_000, 100_000] {
        let under = Arc::new(std::sync::Mutex::new(0));
        // Start the timer only after the Arc/Mutex allocation so setup
        // cost is excluded; Instant is monotonic, unlike SystemTime.
        let start = std::time::Instant::now();
        for _ in 0..count {
            // The guard binds to `_` and is dropped immediately, so each
            // iteration measures exactly one lock + unlock pair.
            let _ = under.lock().unwrap();
        }
        let dur = start.elapsed();
        println!(
            "count={count}, consumed={}ms, ops={}ns",
            dur.as_millis(),
            dur.as_nanos() / count as u128,
        );
    }
}
Tokio spawn test:
/// Benchmarks the cost of `tokio::spawn` alone.
///
/// Tasks are empty and deliberately NOT joined, so the figure reflects
/// pure task-creation/scheduling overhead, not completion cost.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn bench_tokio_spawn() {
    let count = 100_000;
    // Instant is monotonic; SystemTime can go backwards under clock
    // adjustment and corrupt the measurement.
    let start = std::time::Instant::now();
    for _ in 0..count {
        // An empty async block: the minimum possible spawned task.
        tokio::spawn(async move {});
    }
    let dur = start.elapsed();
    println!(
        "count={count}, consumed={}ms, ops={}ns",
        dur.as_millis(),
        dur.as_nanos() / count as u128,
    );
}
=============UPDATED=========== For --release:
std::sync::Mutex: 13ns;
tokio::sync::Mutex: 130ns;
std::sync::mpsc::channel: 200ns;
tokio::sync::mpsc::channel: 256ns;
tokio::spawn: 553ns;
Solution
Add --release
to instruct the compiler to perform optimizations.
To demonstrate just how much of a difference this makes, here is a simple add
function, shown first as compiled with optimizations and then without:
/// Adds two unsigned 32-bit integers.
/// Debug builds insert an overflow check that panics on wrap; with
/// `--release` the check is compiled away, as the listings below show.
pub fn add(a: u32, b: u32) -> u32 {
a + b
}
example::add:
lea eax, [rdi + rsi]
ret
example::add:
push rax
add edi, esi
mov dword ptr [rsp + 4], edi
setb al
test al, 1
jne .LBB0_2
mov eax, dword ptr [rsp + 4]
pop rcx
ret
.LBB0_2:
lea rdi, [rip + str.0]
lea rdx, [rip + .L__unnamed_1]
mov rax, qword ptr [rip + core::panicking::panic@GOTPCREL]
mov esi, 28
call rax
ud2
.L__unnamed_2:
.ascii "/app/example.rs"
.L__unnamed_1:
.quad .L__unnamed_2
.asciz "\017\000\000\000\000\000\000\000\002\000\000\000\005\000\000"
str.0:
.ascii "attempt to add with overflow"
Note that the optimized version no longer contains an overflow check. The overflow check is very useful during debugging, but it is also very slow.
Answered By - Finomnis Answer Checked By - Clifford M. (PHPFixing Volunteer)
0 Comments:
Post a Comment
Note: Only a member of this blog may post a comment.