2015-10-19 23:11:32 -07:00

680 lines
19 KiB
Go

// mgo - MongoDB driver for Go
//
// Copyright (c) 2010-2012 - Gustavo Niemeyer <gustavo@niemeyer.net>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package mgo
import (
"errors"
"fmt"
"net"
"strconv"
"strings"
"sync"
"time"
"gopkg.in/mgo.v2/bson"
)
// ---------------------------------------------------------------------------
// Mongo cluster encapsulation.
//
// A cluster enables the communication with one or more servers participating
// in a mongo cluster. This works with individual servers, a replica set,
// a replica pair, one or multiple mongos routers, etc.
type mongoCluster struct {
sync.RWMutex
serverSynced sync.Cond
userSeeds []string
dynaSeeds []string
servers mongoServers
masters mongoServers
references int
syncing bool
direct bool
failFast bool
syncCount uint
setName string
cachedIndex map[string]bool
sync chan bool
dial dialer
}
func newCluster(userSeeds []string, direct, failFast bool, dial dialer, setName string) *mongoCluster {
cluster := &mongoCluster{
userSeeds: userSeeds,
references: 1,
direct: direct,
failFast: failFast,
dial: dial,
setName: setName,
}
cluster.serverSynced.L = cluster.RWMutex.RLocker()
cluster.sync = make(chan bool, 1)
stats.cluster(+1)
go cluster.syncServersLoop()
return cluster
}
// Acquire increases the reference count for the cluster.
func (cluster *mongoCluster) Acquire() {
cluster.Lock()
cluster.references++
debugf("Cluster %p acquired (refs=%d)", cluster, cluster.references)
cluster.Unlock()
}
// Release decreases the reference count for the cluster. Once
// it reaches zero, all servers will be closed.
func (cluster *mongoCluster) Release() {
cluster.Lock()
if cluster.references == 0 {
panic("cluster.Release() with references == 0")
}
cluster.references--
debugf("Cluster %p released (refs=%d)", cluster, cluster.references)
if cluster.references == 0 {
for _, server := range cluster.servers.Slice() {
server.Close()
}
// Wake up the sync loop so it can die.
cluster.syncServers()
stats.cluster(-1)
}
cluster.Unlock()
}
func (cluster *mongoCluster) LiveServers() (servers []string) {
cluster.RLock()
for _, serv := range cluster.servers.Slice() {
servers = append(servers, serv.Addr)
}
cluster.RUnlock()
return servers
}
func (cluster *mongoCluster) removeServer(server *mongoServer) {
cluster.Lock()
cluster.masters.Remove(server)
other := cluster.servers.Remove(server)
cluster.Unlock()
if other != nil {
other.Close()
log("Removed server ", server.Addr, " from cluster.")
}
server.Close()
}
type isMasterResult struct {
IsMaster bool
Secondary bool
Primary string
Hosts []string
Passives []string
Tags bson.D
Msg string
SetName string `bson:"setName"`
MaxWireVersion int `bson:"maxWireVersion"`
}
func (cluster *mongoCluster) isMaster(socket *mongoSocket, result *isMasterResult) error {
// Monotonic let's it talk to a slave and still hold the socket.
session := newSession(Monotonic, cluster, 10*time.Second)
session.setSocket(socket)
err := session.Run("ismaster", result)
session.Close()
return err
}
type possibleTimeout interface {
Timeout() bool
}
var syncSocketTimeout = 5 * time.Second
func (cluster *mongoCluster) syncServer(server *mongoServer) (info *mongoServerInfo, hosts []string, err error) {
var syncTimeout time.Duration
if raceDetector {
// This variable is only ever touched by tests.
globalMutex.Lock()
syncTimeout = syncSocketTimeout
globalMutex.Unlock()
} else {
syncTimeout = syncSocketTimeout
}
addr := server.Addr
log("SYNC Processing ", addr, "...")
// Retry a few times to avoid knocking a server down for a hiccup.
var result isMasterResult
var tryerr error
for retry := 0; ; retry++ {
if retry == 3 || retry == 1 && cluster.failFast {
return nil, nil, tryerr
}
if retry > 0 {
// Don't abuse the server needlessly if there's something actually wrong.
if err, ok := tryerr.(possibleTimeout); ok && err.Timeout() {
// Give a chance for waiters to timeout as well.
cluster.serverSynced.Broadcast()
}
time.Sleep(syncShortDelay)
}
// It's not clear what would be a good timeout here. Is it
// better to wait longer or to retry?
socket, _, err := server.AcquireSocket(0, syncTimeout)
if err != nil {
tryerr = err
logf("SYNC Failed to get socket to %s: %v", addr, err)
continue
}
err = cluster.isMaster(socket, &result)
socket.Release()
if err != nil {
tryerr = err
logf("SYNC Command 'ismaster' to %s failed: %v", addr, err)
continue
}
debugf("SYNC Result of 'ismaster' from %s: %#v", addr, result)
break
}
if cluster.setName != "" && result.SetName != cluster.setName {
logf("SYNC Server %s is not a member of replica set %q", addr, cluster.setName)
return nil, nil, fmt.Errorf("server %s is not a member of replica set %q", addr, cluster.setName)
}
if result.IsMaster {
debugf("SYNC %s is a master.", addr)
if !server.info.Master {
// Made an incorrect assumption above, so fix stats.
stats.conn(-1, false)
stats.conn(+1, true)
}
} else if result.Secondary {
debugf("SYNC %s is a slave.", addr)
} else if cluster.direct {
logf("SYNC %s in unknown state. Pretending it's a slave due to direct connection.", addr)
} else {
logf("SYNC %s is neither a master nor a slave.", addr)
// Let stats track it as whatever was known before.
return nil, nil, errors.New(addr + " is not a master nor slave")
}
info = &mongoServerInfo{
Master: result.IsMaster,
Mongos: result.Msg == "isdbgrid",
Tags: result.Tags,
SetName: result.SetName,
MaxWireVersion: result.MaxWireVersion,
}
hosts = make([]string, 0, 1+len(result.Hosts)+len(result.Passives))
if result.Primary != "" {
// First in the list to speed up master discovery.
hosts = append(hosts, result.Primary)
}
hosts = append(hosts, result.Hosts...)
hosts = append(hosts, result.Passives...)
debugf("SYNC %s knows about the following peers: %#v", addr, hosts)
return info, hosts, nil
}
type syncKind bool
const (
completeSync syncKind = true
partialSync syncKind = false
)
func (cluster *mongoCluster) addServer(server *mongoServer, info *mongoServerInfo, syncKind syncKind) {
cluster.Lock()
current := cluster.servers.Search(server.ResolvedAddr)
if current == nil {
if syncKind == partialSync {
cluster.Unlock()
server.Close()
log("SYNC Discarding unknown server ", server.Addr, " due to partial sync.")
return
}
cluster.servers.Add(server)
if info.Master {
cluster.masters.Add(server)
log("SYNC Adding ", server.Addr, " to cluster as a master.")
} else {
log("SYNC Adding ", server.Addr, " to cluster as a slave.")
}
} else {
if server != current {
panic("addServer attempting to add duplicated server")
}
if server.Info().Master != info.Master {
if info.Master {
log("SYNC Server ", server.Addr, " is now a master.")
cluster.masters.Add(server)
} else {
log("SYNC Server ", server.Addr, " is now a slave.")
cluster.masters.Remove(server)
}
}
}
server.SetInfo(info)
debugf("SYNC Broadcasting availability of server %s", server.Addr)
cluster.serverSynced.Broadcast()
cluster.Unlock()
}
func (cluster *mongoCluster) getKnownAddrs() []string {
cluster.RLock()
max := len(cluster.userSeeds) + len(cluster.dynaSeeds) + cluster.servers.Len()
seen := make(map[string]bool, max)
known := make([]string, 0, max)
add := func(addr string) {
if _, found := seen[addr]; !found {
seen[addr] = true
known = append(known, addr)
}
}
for _, addr := range cluster.userSeeds {
add(addr)
}
for _, addr := range cluster.dynaSeeds {
add(addr)
}
for _, serv := range cluster.servers.Slice() {
add(serv.Addr)
}
cluster.RUnlock()
return known
}
// syncServers injects a value into the cluster.sync channel to force
// an iteration of the syncServersLoop function.
func (cluster *mongoCluster) syncServers() {
select {
case cluster.sync <- true:
default:
}
}
// How long to wait for a checkup of the cluster topology if nothing
// else kicks a synchronization before that.
const syncServersDelay = 30 * time.Second
const syncShortDelay = 500 * time.Millisecond
// syncServersLoop loops while the cluster is alive to keep its idea of
// the server topology up-to-date. It must be called just once from
// newCluster. The loop iterates once syncServersDelay has passed, or
// if somebody injects a value into the cluster.sync channel to force a
// synchronization. A loop iteration will contact all servers in
// parallel, ask them about known peers and their own role within the
// cluster, and then attempt to do the same with all the peers
// retrieved.
func (cluster *mongoCluster) syncServersLoop() {
for {
debugf("SYNC Cluster %p is starting a sync loop iteration.", cluster)
cluster.Lock()
if cluster.references == 0 {
cluster.Unlock()
break
}
cluster.references++ // Keep alive while syncing.
direct := cluster.direct
cluster.Unlock()
cluster.syncServersIteration(direct)
// We just synchronized, so consume any outstanding requests.
select {
case <-cluster.sync:
default:
}
cluster.Release()
// Hold off before allowing another sync. No point in
// burning CPU looking for down servers.
if !cluster.failFast {
time.Sleep(syncShortDelay)
}
cluster.Lock()
if cluster.references == 0 {
cluster.Unlock()
break
}
cluster.syncCount++
// Poke all waiters so they have a chance to timeout or
// restart syncing if they wish to.
cluster.serverSynced.Broadcast()
// Check if we have to restart immediately either way.
restart := !direct && cluster.masters.Empty() || cluster.servers.Empty()
cluster.Unlock()
if restart {
log("SYNC No masters found. Will synchronize again.")
time.Sleep(syncShortDelay)
continue
}
debugf("SYNC Cluster %p waiting for next requested or scheduled sync.", cluster)
// Hold off until somebody explicitly requests a synchronization
// or it's time to check for a cluster topology change again.
select {
case <-cluster.sync:
case <-time.After(syncServersDelay):
}
}
debugf("SYNC Cluster %p is stopping its sync loop.", cluster)
}
func (cluster *mongoCluster) server(addr string, tcpaddr *net.TCPAddr) *mongoServer {
cluster.RLock()
server := cluster.servers.Search(tcpaddr.String())
cluster.RUnlock()
if server != nil {
return server
}
return newServer(addr, tcpaddr, cluster.sync, cluster.dial)
}
func resolveAddr(addr string) (*net.TCPAddr, error) {
// Simple cases that do not need actual resolution. Works with IPv4 and v6.
if host, port, err := net.SplitHostPort(addr); err == nil {
if port, _ := strconv.Atoi(port); port > 0 {
zone := ""
if i := strings.LastIndex(host, "%"); i >= 0 {
zone = host[i+1:]
host = host[:i]
}
ip := net.ParseIP(host)
if ip != nil {
return &net.TCPAddr{IP: ip, Port: port, Zone: zone}, nil
}
}
}
// Attempt to resolve IPv4 and v6 concurrently.
addrChan := make(chan *net.TCPAddr, 2)
for _, network := range []string{"udp4", "udp6"} {
network := network
go func() {
// The unfortunate UDP dialing hack allows having a timeout on address resolution.
conn, err := net.DialTimeout(network, addr, 10*time.Second)
if err != nil {
addrChan <- nil
} else {
addrChan <- (*net.TCPAddr)(conn.RemoteAddr().(*net.UDPAddr))
conn.Close()
}
}()
}
// Wait for the result of IPv4 and v6 resolution. Use IPv4 if available.
tcpaddr := <-addrChan
if tcpaddr == nil || len(tcpaddr.IP) != 4 {
var timeout <-chan time.Time
if tcpaddr != nil {
// Don't wait too long if an IPv6 address is known.
timeout = time.After(50 * time.Millisecond)
}
select {
case <-timeout:
case tcpaddr2 := <-addrChan:
if tcpaddr == nil || tcpaddr2 != nil {
// It's an IPv4 address or the only known address. Use it.
tcpaddr = tcpaddr2
}
}
}
if tcpaddr == nil {
log("SYNC Failed to resolve server address: ", addr)
return nil, errors.New("failed to resolve server address: " + addr)
}
if tcpaddr.String() != addr {
debug("SYNC Address ", addr, " resolved as ", tcpaddr.String())
}
return tcpaddr, nil
}
type pendingAdd struct {
server *mongoServer
info *mongoServerInfo
}
func (cluster *mongoCluster) syncServersIteration(direct bool) {
log("SYNC Starting full topology synchronization...")
var wg sync.WaitGroup
var m sync.Mutex
notYetAdded := make(map[string]pendingAdd)
addIfFound := make(map[string]bool)
seen := make(map[string]bool)
syncKind := partialSync
var spawnSync func(addr string, byMaster bool)
spawnSync = func(addr string, byMaster bool) {
wg.Add(1)
go func() {
defer wg.Done()
tcpaddr, err := resolveAddr(addr)
if err != nil {
log("SYNC Failed to start sync of ", addr, ": ", err.Error())
return
}
resolvedAddr := tcpaddr.String()
m.Lock()
if byMaster {
if pending, ok := notYetAdded[resolvedAddr]; ok {
delete(notYetAdded, resolvedAddr)
m.Unlock()
cluster.addServer(pending.server, pending.info, completeSync)
return
}
addIfFound[resolvedAddr] = true
}
if seen[resolvedAddr] {
m.Unlock()
return
}
seen[resolvedAddr] = true
m.Unlock()
server := cluster.server(addr, tcpaddr)
info, hosts, err := cluster.syncServer(server)
if err != nil {
cluster.removeServer(server)
return
}
m.Lock()
add := direct || info.Master || addIfFound[resolvedAddr]
if add {
syncKind = completeSync
} else {
notYetAdded[resolvedAddr] = pendingAdd{server, info}
}
m.Unlock()
if add {
cluster.addServer(server, info, completeSync)
}
if !direct {
for _, addr := range hosts {
spawnSync(addr, info.Master)
}
}
}()
}
knownAddrs := cluster.getKnownAddrs()
for _, addr := range knownAddrs {
spawnSync(addr, false)
}
wg.Wait()
if syncKind == completeSync {
logf("SYNC Synchronization was complete (got data from primary).")
for _, pending := range notYetAdded {
cluster.removeServer(pending.server)
}
} else {
logf("SYNC Synchronization was partial (cannot talk to primary).")
for _, pending := range notYetAdded {
cluster.addServer(pending.server, pending.info, partialSync)
}
}
cluster.Lock()
mastersLen := cluster.masters.Len()
logf("SYNC Synchronization completed: %d master(s) and %d slave(s) alive.", mastersLen, cluster.servers.Len()-mastersLen)
// Update dynamic seeds, but only if we have any good servers. Otherwise,
// leave them alone for better chances of a successful sync in the future.
if syncKind == completeSync {
dynaSeeds := make([]string, cluster.servers.Len())
for i, server := range cluster.servers.Slice() {
dynaSeeds[i] = server.Addr
}
cluster.dynaSeeds = dynaSeeds
debugf("SYNC New dynamic seeds: %#v\n", dynaSeeds)
}
cluster.Unlock()
}
// AcquireSocket returns a socket to a server in the cluster. If slaveOk is
// true, it will attempt to return a socket to a slave server. If it is
// false, the socket will necessarily be to a master server.
func (cluster *mongoCluster) AcquireSocket(mode Mode, slaveOk bool, syncTimeout time.Duration, socketTimeout time.Duration, serverTags []bson.D, poolLimit int) (s *mongoSocket, err error) {
var started time.Time
var syncCount uint
warnedLimit := false
for {
cluster.RLock()
for {
mastersLen := cluster.masters.Len()
slavesLen := cluster.servers.Len() - mastersLen
debugf("Cluster has %d known masters and %d known slaves.", mastersLen, slavesLen)
if !(slaveOk && mode == Secondary) && mastersLen > 0 || slaveOk && slavesLen > 0 {
break
}
if started.IsZero() {
// Initialize after fast path above.
started = time.Now()
syncCount = cluster.syncCount
} else if syncTimeout != 0 && started.Before(time.Now().Add(-syncTimeout)) || cluster.failFast && cluster.syncCount != syncCount {
cluster.RUnlock()
return nil, errors.New("no reachable servers")
}
log("Waiting for servers to synchronize...")
cluster.syncServers()
// Remember: this will release and reacquire the lock.
cluster.serverSynced.Wait()
}
var server *mongoServer
if slaveOk {
server = cluster.servers.BestFit(mode, serverTags)
} else {
server = cluster.masters.BestFit(mode, nil)
}
cluster.RUnlock()
if server == nil {
// Must have failed the requested tags. Sleep to avoid spinning.
time.Sleep(1e8)
continue
}
s, abended, err := server.AcquireSocket(poolLimit, socketTimeout)
if err == errPoolLimit {
if !warnedLimit {
warnedLimit = true
log("WARNING: Per-server connection limit reached.")
}
time.Sleep(100 * time.Millisecond)
continue
}
if err != nil {
cluster.removeServer(server)
cluster.syncServers()
continue
}
if abended && !slaveOk {
var result isMasterResult
err := cluster.isMaster(s, &result)
if err != nil || !result.IsMaster {
logf("Cannot confirm server %s as master (%v)", server.Addr, err)
s.Release()
cluster.syncServers()
time.Sleep(100 * time.Millisecond)
continue
}
}
return s, nil
}
panic("unreached")
}
func (cluster *mongoCluster) CacheIndex(cacheKey string, exists bool) {
cluster.Lock()
if cluster.cachedIndex == nil {
cluster.cachedIndex = make(map[string]bool)
}
if exists {
cluster.cachedIndex[cacheKey] = true
} else {
delete(cluster.cachedIndex, cacheKey)
}
cluster.Unlock()
}
func (cluster *mongoCluster) HasCachedIndex(cacheKey string) (result bool) {
cluster.RLock()
if cluster.cachedIndex != nil {
result = cluster.cachedIndex[cacheKey]
}
cluster.RUnlock()
return
}
func (cluster *mongoCluster) ResetIndexCache() {
cluster.Lock()
cluster.cachedIndex = make(map[string]bool)
cluster.Unlock()
}