Issue
The following scenarios are frequently used in asynchronous programming.
- channel tx/rx;
- mutex lock/unlock;
- async task spawn;
So I ran some comparison tests on a low-performance cloud host (roughly equivalent to an Intel J1900). I found that Rust/Tokio performed far worse than Go.
Is there any parameter that needs to be adjusted? Can a single thread executor improve it?
Results.
tx/rx, time per op: go-lang: 112 ns,
tokio::sync::mpsc::channel: 7387 ns;
std::sync::channel: 2705 ns,
crossbeam: 1062 ns.
mutex lock/unlock, per op:
tokio::sync::Mutex 4051 ns
std::sync::Mutex 321 ns
spawn (not join), per op:
tokio::spawn: 8445 ns
Rust tokio test tx/rx on channel
/// Benchmarks tokio mpsc channel send/recv round-trip throughput.
///
/// A consumer task drains the channel until it sees the final value,
/// then total wall time and per-operation nanoseconds are printed.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn test_chan_benchmark() {
    let count = 100_000;
    let (tx, mut rx) = tokio::sync::mpsc::channel(10000);
    // Instant is a monotonic clock; SystemTime can jump backwards and
    // make duration_since() return Err, which would abort the benchmark.
    let start = std::time::Instant::now();
    let handle = tokio::spawn(async move {
        // recv() yields None once every sender is dropped, so `while let`
        // terminates cleanly instead of panicking on unwrap().
        while let Some(i) = rx.recv().await {
            if i == count - 1 {
                break;
            }
        }
    });
    for i in 0..count {
        tx.send(i).await.unwrap();
    }
    drop(tx);
    handle.await.unwrap();
    let dur = start.elapsed();
    println!(
        "count={count}, consumed={}ms, ops={}ns",
        dur.as_millis(),
        dur.as_nanos() / count as u128,
    );
}
Go channel tx/rx:
// TestChanPerformance measures the per-operation cost of sending and
// receiving an int over a buffered Go channel.
func TestChanPerformance(t *testing.T) {
	const count = 1000000
	ch := make(chan int, count)
	done := make(chan int, 1)
	start := time.Now()
	go func() {
		// Drain the channel until it is closed, then signal completion.
		for range ch {
		}
		done <- 0
	}()
	for i := 0; i < count; i++ {
		ch <- i
	}
	close(ch)
	<-done
	elapsed := time.Since(start)
	t.Logf("txrx %d times consumed %d ms, %d nspo", count, elapsed.Milliseconds(), elapsed.Nanoseconds()/int64(count))
}
Mutex test:
/// Benchmarks the uncontended lock/unlock cost of `std::sync::Mutex`
/// for several iteration counts.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn bench_std_mutex() {
    for count in [1_000, 10_000, 100_000] {
        let under = Arc::new(std::sync::Mutex::new(0));
        // Start the timer only after the Arc/Mutex allocation so setup
        // cost is excluded; Instant is monotonic, unlike SystemTime.
        let start = std::time::Instant::now();
        for _ in 0..count {
            // The guard binds to `_` and is dropped immediately, so each
            // iteration measures exactly one lock + unlock pair.
            let _ = under.lock().unwrap();
        }
        let dur = start.elapsed();
        println!(
            "count={count}, consumed={}ms, ops={}ns",
            dur.as_millis(),
            dur.as_nanos() / count as u128,
        );
    }
}
Tokio spawn test:
/// Benchmarks the cost of `tokio::spawn` alone.
///
/// Tasks are empty and deliberately NOT joined, so the figure reflects
/// pure task-creation/scheduling overhead, not completion cost.
/// NOTE: benchmark only with `--release`; debug builds are 10-100x slower.
#[tokio::test]
async fn bench_tokio_spawn() {
    let count = 100_000;
    // Instant is monotonic; SystemTime can go backwards under clock
    // adjustment and corrupt the measurement.
    let start = std::time::Instant::now();
    for _ in 0..count {
        // An empty async block: the minimum possible spawned task.
        tokio::spawn(async move {});
    }
    let dur = start.elapsed();
    println!(
        "count={count}, consumed={}ms, ops={}ns",
        dur.as_millis(),
        dur.as_nanos() / count as u128,
    );
}
=============UPDATED=========== For --release:
std::sync::Mutex: 13ns;
tokio::sync::Mutex: 130ns;
std::sync::mpsc::channel: 200ns;
tokio::sync::mpsc::channel: 256ns;
tokio::spawn: 553ns;
Solution
Add --release
to instruct the compiler to perform optimizations.
To demonstrate just how much of a difference this makes, here is a simple add
function, shown first as compiled with optimizations and then without:
/// Adds two unsigned 32-bit integers.
/// Debug builds insert an overflow check that panics on wrap; with
/// `--release` the check is compiled away, as the listings below show.
pub fn add(a: u32, b: u32) -> u32 {
a + b
}
example::add:
lea eax, [rdi + rsi]
ret
example::add:
push rax
add edi, esi
mov dword ptr [rsp + 4], edi
setb al
test al, 1
jne .LBB0_2
mov eax, dword ptr [rsp + 4]
pop rcx
ret
.LBB0_2:
lea rdi, [rip + str.0]
lea rdx, [rip + .L__unnamed_1]
mov rax, qword ptr [rip + core::panicking::panic@GOTPCREL]
mov esi, 28
call rax
ud2
.L__unnamed_2:
.ascii "/app/example.rs"
.L__unnamed_1:
.quad .L__unnamed_2
.asciz "\017\000\000\000\000\000\000\000\002\000\000\000\005\000\000"
str.0:
.ascii "attempt to add with overflow"
Note that the optimized version no longer contains an overflow check. The overflow check is very useful during debugging, but it is also very slow.
Answered By - Finomnis Answer Checked By - Clifford M. (PHPFixing Volunteer)
0 Comments:
Post a Comment
Note: Only a member of this blog may post a comment.