2022-12-12 23:31:21 -05:00
|
|
|
// Copyright (c) 2015-2022 MinIO, Inc.
|
|
|
|
//
|
2023-07-08 13:42:05 -04:00
|
|
|
// This file is part of MinIO Object Storage stack
|
2022-12-12 23:31:21 -05:00
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
package http
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"crypto/tls"
|
|
|
|
"crypto/x509"
|
|
|
|
"net/http"
|
|
|
|
"syscall"
|
|
|
|
"time"
|
|
|
|
|
2023-09-04 15:57:37 -04:00
|
|
|
"github.com/minio/pkg/v2/certs"
|
2022-12-12 23:31:21 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
// tlsClientSessionCacheSize is the capacity passed to
// tls.NewLRUClientSessionCache for the client-side TLS session cache.
var tlsClientSessionCacheSize = 100
|
|
|
|
|
Add PutObject Ring Buffer (#19605)
Replace the `io.Pipe` from streamingBitrotWriter -> CreateFile with a fixed size ring buffer.
This will add an output buffer for encoded shards to be written to disk - potentially via RPC.
This will remove blocking when `(*streamingBitrotWriter).Write` is called, and it writes hashes and data.
With current settings, the write looks like this:
```
Outbound
┌───────────────────┐ ┌────────────────┐ ┌───────────────┐ ┌────────────────┐
│ │ Parr. │ │ (http body) │ │ │ │
│ Bitrot Hash │ Write │ Pipe │ Read │ HTTP buffer │ Write (syscall) │ TCP Buffer │
│ Erasure Shard │ ──────────► │ (unbuffered) │ ────────────► │ (64K Max) │ ───────────────────► │ (4MB) │
│ │ │ │ │ (io.Copy) │ │ │
└───────────────────┘ └────────────────┘ └───────────────┘ └────────────────┘
```
We write a Hash (32 bytes). Since the pipe is unbuffered, it will block until the 32 bytes have
been delivered to the TCP buffer, and the next Read hits the Pipe.
Then we write the shard data. This will typically be bigger than 64KB, so it will block until two blocks
have been read from the pipe.
When we insert a ring buffer:
```
Outbound
┌───────────────────┐ ┌────────────────┐ ┌───────────────┐ ┌────────────────┐
│ │ │ │ (http body) │ │ │ │
│ Bitrot Hash │ Write │ Ring Buffer │ Read │ HTTP buffer │ Write (syscall) │ TCP Buffer │
│ Erasure Shard │ ──────────► │ (2MB) │ ────────────► │ (64K Max) │ ───────────────────► │ (4MB) │
│ │ │ │ │ (io.Copy) │ │ │
└───────────────────┘ └────────────────┘ └───────────────┘ └────────────────┘
```
The hash+shard will fit within the ring buffer, so writes will not block - but will complete after a
memcopy. Reads can fill the 64KB buffer if there is data for it.
If the network is congested, the ring buffer will become filled, and all syscalls will be on full buffers.
Only when the ring buffer is filled will erasure coding start blocking.
Since there is always "space" to write output data, we remove the parallel writing since we are
always writing to memory now, and the goroutine synchronization overhead is probably not worth taking.
If the output were blocked in the existing implementation, we would still wait for it to unblock in parallel write, so it would
make no difference there - except now the ring buffer smooths out the load.
There are some micro-optimizations we could look at later. The biggest is that, in most cases,
we could encode directly to the ring buffer - if we are not at a boundary. Also, "force filling" the
Read requests (i.e., blocking until a full read can be completed) could be investigated and maybe
allow concurrent memory on read and write.
2024-05-14 20:11:04 -04:00
|
|
|
// Buffer sizes configured on the http.Transport built in this file.
const (
	// WriteBufferSize is the transport write buffer size: 64KiB, moving up
	// from the 4KiB default.
	WriteBufferSize = 64 * 1024

	// ReadBufferSize is the transport read buffer size: 64KiB, moving up
	// from the 4KiB default.
	ReadBufferSize = 64 * 1024
)
|
|
|
|
|
2022-12-12 23:31:21 -05:00
|
|
|
// ConnSettings - contains connection settings.
|
|
|
|
type ConnSettings struct {
|
2023-07-03 15:30:51 -04:00
|
|
|
DialContext DialContext // Custom dialContext, DialTimeout is ignored if this is already setup.
|
|
|
|
LookupHost LookupHost // Custom lookupHost, is nil on containerized deployments.
|
2022-12-12 23:31:21 -05:00
|
|
|
DialTimeout time.Duration
|
|
|
|
|
|
|
|
// TLS Settings
|
|
|
|
RootCAs *x509.CertPool
|
|
|
|
CipherSuites []uint16
|
|
|
|
CurvePreferences []tls.CurveID
|
|
|
|
|
|
|
|
// HTTP2
|
|
|
|
EnableHTTP2 bool
|
2023-05-03 17:12:25 -04:00
|
|
|
|
|
|
|
// TCP Options
|
|
|
|
TCPOptions TCPOptions
|
2022-12-12 23:31:21 -05:00
|
|
|
}
|
|
|
|
|
2024-01-29 19:50:37 -05:00
|
|
|
func (s ConnSettings) getDefaultTransport(maxIdleConnsPerHost int) *http.Transport {
|
|
|
|
if maxIdleConnsPerHost <= 0 {
|
|
|
|
maxIdleConnsPerHost = 1024
|
|
|
|
}
|
|
|
|
|
2022-12-12 23:31:21 -05:00
|
|
|
dialContext := s.DialContext
|
|
|
|
if dialContext == nil {
|
2023-07-03 15:30:51 -04:00
|
|
|
dialContext = DialContextWithLookupHost(s.LookupHost, NewInternodeDialContext(s.DialTimeout, s.TCPOptions))
|
2022-12-12 23:31:21 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
tlsClientConfig := tls.Config{
|
|
|
|
RootCAs: s.RootCAs,
|
|
|
|
CipherSuites: s.CipherSuites,
|
|
|
|
CurvePreferences: s.CurvePreferences,
|
|
|
|
ClientSessionCache: tls.NewLRUClientSessionCache(tlsClientSessionCacheSize),
|
|
|
|
}
|
|
|
|
|
|
|
|
// For more details about various values used here refer
|
|
|
|
// https://golang.org/pkg/net/http/#Transport documentation
|
|
|
|
tr := &http.Transport{
|
|
|
|
Proxy: http.ProxyFromEnvironment,
|
|
|
|
DialContext: dialContext,
|
2024-01-29 19:50:37 -05:00
|
|
|
MaxIdleConnsPerHost: maxIdleConnsPerHost,
|
Add PutObject Ring Buffer (#19605)
Replace the `io.Pipe` from streamingBitrotWriter -> CreateFile with a fixed size ring buffer.
This will add an output buffer for encoded shards to be written to disk - potentially via RPC.
This will remove blocking when `(*streamingBitrotWriter).Write` is called, and it writes hashes and data.
With current settings, the write looks like this:
```
Outbound
┌───────────────────┐ ┌────────────────┐ ┌───────────────┐ ┌────────────────┐
│ │ Parr. │ │ (http body) │ │ │ │
│ Bitrot Hash │ Write │ Pipe │ Read │ HTTP buffer │ Write (syscall) │ TCP Buffer │
│ Erasure Shard │ ──────────► │ (unbuffered) │ ────────────► │ (64K Max) │ ───────────────────► │ (4MB) │
│ │ │ │ │ (io.Copy) │ │ │
└───────────────────┘ └────────────────┘ └───────────────┘ └────────────────┘
```
We write a Hash (32 bytes). Since the pipe is unbuffered, it will block until the 32 bytes have
been delivered to the TCP buffer, and the next Read hits the Pipe.
Then we write the shard data. This will typically be bigger than 64KB, so it will block until two blocks
have been read from the pipe.
When we insert a ring buffer:
```
Outbound
┌───────────────────┐ ┌────────────────┐ ┌───────────────┐ ┌────────────────┐
│ │ │ │ (http body) │ │ │ │
│ Bitrot Hash │ Write │ Ring Buffer │ Read │ HTTP buffer │ Write (syscall) │ TCP Buffer │
│ Erasure Shard │ ──────────► │ (2MB) │ ────────────► │ (64K Max) │ ───────────────────► │ (4MB) │
│ │ │ │ │ (io.Copy) │ │ │
└───────────────────┘ └────────────────┘ └───────────────┘ └────────────────┘
```
The hash+shard will fit within the ring buffer, so writes will not block - but will complete after a
memcopy. Reads can fill the 64KB buffer if there is data for it.
If the network is congested, the ring buffer will become filled, and all syscalls will be on full buffers.
Only when the ring buffer is filled will erasure coding start blocking.
Since there is always "space" to write output data, we remove the parallel writing since we are
always writing to memory now, and the goroutine synchronization overhead probably not worth taking.
If the output were blocked in the existing, we would still wait for it to unblock in parallel write, so it would
make no difference there - except now the ring buffer smoothes out the load.
There are some micro-optimizations we could look at later. The biggest is that, in most cases,
we could encode directly to the ring buffer - if we are not at a boundary. Also, "force filling" the
Read requests (i.e., blocking until a full read can be completed) could be investigated and maybe
allow concurrent memory on read and write.
2024-05-14 20:11:04 -04:00
|
|
|
WriteBufferSize: WriteBufferSize,
|
|
|
|
ReadBufferSize: ReadBufferSize,
|
2022-12-12 23:31:21 -05:00
|
|
|
IdleConnTimeout: 15 * time.Second,
|
|
|
|
ResponseHeaderTimeout: 15 * time.Minute, // Conservative timeout is the default (for MinIO internode)
|
|
|
|
TLSHandshakeTimeout: 10 * time.Second,
|
|
|
|
TLSClientConfig: &tlsClientConfig,
|
|
|
|
ForceAttemptHTTP2: s.EnableHTTP2,
|
|
|
|
// Go net/http automatically unzip if content-type is
|
|
|
|
// gzip disable this feature, as we are always interested
|
|
|
|
// in raw stream.
|
|
|
|
DisableCompression: true,
|
|
|
|
}
|
|
|
|
|
|
|
|
// https://github.com/golang/go/issues/23559
|
|
|
|
// https://github.com/golang/go/issues/42534
|
|
|
|
// https://github.com/golang/go/issues/43989
|
|
|
|
// https://github.com/golang/go/issues/33425
|
|
|
|
// https://github.com/golang/go/issues/29246
|
|
|
|
// if tlsConfig != nil {
|
|
|
|
// trhttp2, _ := http2.ConfigureTransports(tr)
|
|
|
|
// if trhttp2 != nil {
|
|
|
|
// // ReadIdleTimeout is the timeout after which a health check using ping
|
|
|
|
// // frame will be carried out if no frame is received on the
|
|
|
|
// // connection. 5 minutes is sufficient time for any idle connection.
|
|
|
|
// trhttp2.ReadIdleTimeout = 5 * time.Minute
|
|
|
|
// // PingTimeout is the timeout after which the connection will be closed
|
|
|
|
// // if a response to Ping is not received.
|
|
|
|
// trhttp2.PingTimeout = dialTimeout
|
|
|
|
// // DisableCompression, if true, prevents the Transport from
|
|
|
|
// // requesting compression with an "Accept-Encoding: gzip"
|
|
|
|
// trhttp2.DisableCompression = true
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
|
|
|
|
return tr
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewInternodeHTTPTransport returns transport for internode MinIO connections.
|
2024-01-29 19:50:37 -05:00
|
|
|
func (s ConnSettings) NewInternodeHTTPTransport(maxIdleConnsPerHost int) func() http.RoundTripper {
|
|
|
|
tr := s.getDefaultTransport(maxIdleConnsPerHost)
|
2022-12-12 23:31:21 -05:00
|
|
|
|
|
|
|
// Settings specific to internode requests.
|
2023-01-17 23:36:38 -05:00
|
|
|
tr.TLSHandshakeTimeout = 15 * time.Second
|
2022-12-12 23:31:21 -05:00
|
|
|
|
|
|
|
return func() http.RoundTripper {
|
|
|
|
return tr
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewCustomHTTPProxyTransport is used only for proxied requests, specifically
|
|
|
|
// only supports HTTP/1.1
|
|
|
|
func (s ConnSettings) NewCustomHTTPProxyTransport() func() *http.Transport {
|
|
|
|
s.EnableHTTP2 = false
|
2024-01-29 19:50:37 -05:00
|
|
|
tr := s.getDefaultTransport(0)
|
2022-12-12 23:31:21 -05:00
|
|
|
|
|
|
|
// Settings specific to proxied requests.
|
|
|
|
tr.ResponseHeaderTimeout = 30 * time.Minute
|
|
|
|
|
|
|
|
return func() *http.Transport {
|
|
|
|
return tr
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewHTTPTransportWithTimeout allows setting a timeout for response headers
|
|
|
|
func (s ConnSettings) NewHTTPTransportWithTimeout(timeout time.Duration) *http.Transport {
|
2024-01-29 19:50:37 -05:00
|
|
|
tr := s.getDefaultTransport(0)
|
2022-12-12 23:31:21 -05:00
|
|
|
|
|
|
|
// Settings specific to this transport.
|
|
|
|
tr.ResponseHeaderTimeout = timeout
|
|
|
|
return tr
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewHTTPTransportWithClientCerts returns a new http configuration used for
|
|
|
|
// communicating with client cert authentication.
|
|
|
|
func (s ConnSettings) NewHTTPTransportWithClientCerts(ctx context.Context, clientCert, clientKey string) (*http.Transport, error) {
|
|
|
|
transport := s.NewHTTPTransportWithTimeout(1 * time.Minute)
|
|
|
|
if clientCert != "" && clientKey != "" {
|
|
|
|
c, err := certs.NewManager(ctx, clientCert, clientKey, tls.LoadX509KeyPair)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if c != nil {
|
|
|
|
c.UpdateReloadDuration(10 * time.Second)
|
|
|
|
c.ReloadOnSignal(syscall.SIGHUP) // allow reloads upon SIGHUP
|
|
|
|
transport.TLSClientConfig.GetClientCertificate = c.GetClientCertificate
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return transport, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewRemoteTargetHTTPTransport returns a new http configuration
|
|
|
|
// used while communicating with the remote replication targets.
|
2023-06-30 11:04:13 -04:00
|
|
|
func (s ConnSettings) NewRemoteTargetHTTPTransport(insecure bool) func() *http.Transport {
|
2024-01-29 19:50:37 -05:00
|
|
|
tr := s.getDefaultTransport(0)
|
2022-12-12 23:31:21 -05:00
|
|
|
|
2023-06-30 11:04:13 -04:00
|
|
|
tr.TLSHandshakeTimeout = 10 * time.Second
|
2022-12-12 23:31:21 -05:00
|
|
|
tr.ResponseHeaderTimeout = 0
|
2023-06-30 11:04:13 -04:00
|
|
|
tr.TLSClientConfig.InsecureSkipVerify = insecure
|
2022-12-12 23:31:21 -05:00
|
|
|
|
|
|
|
return func() *http.Transport {
|
|
|
|
return tr
|
|
|
|
}
|
|
|
|
}
|