Details
-
Type:
Bug
-
Status:
Open
-
Priority:
Major
-
Resolution: Unresolved
-
Affects Version/s: 2.0.1
-
Fix Version/s: 2.1
-
Component/s: couchbase-bucket, ns_server
-
Security Level: Public
-
Labels: None
-
Environment: 2.0.1-156-rel
Centos
7 : 5
Source: 7 nodes
all 4-core, 30G SSDs
Destination: 5 nodes
3 4-core, 30G SSDs
2 8-core, 30G SSDs
default: source --> destination
saslbucket: source --> destination
Description
Current status on buckets:
default ~80M items
saslbucket ~65M items
- 1 ongoing replication for each of the buckets
- Mixed load (creates-updates-deletes-expirations) running on both nodes
- 1 Production view under a design doc for default on the source cluster
- Continuously running queries on the view as well
Live cluster: (Uptime >24 hours)
Source: http://10.6.2.37:8091
Destination: http://10.6.2.45:8091
- Rebalance operation running:
Starting rebalance, KeepNodes = ['ns_1@10.6.2.37','ns_1@10.6.2.38',
'ns_1@10.6.2.39','ns_1@10.6.2.40',
'ns_1@10.6.2.42'], EjectNodes = ['ns_1@10.6.2.43',
'ns_1@10.6.2.44']
- Rebalance operation very slow because:
Indexing default/_design/d1
Compacting bucket default
Compacting bucket saslbucket
Compacting bucket saslbucket
While the rebalance was running, saw the following messages at least a couple of times:
"Could not auto-failover node ('ns_1@10.6.2.38'). There was at least another node down."
"Could not automatically failover node 'ns_1@10.6.2.38' because I think rebalance is running"
And then:
- Couchbase server on Node 10.6.2.39 goes down
- Rebalance exits
Rebalance exited with reason {bulk_set_vbucket_state_failed,
[{'ns_1@10.6.2.39',
{'EXIT',
{{nodedown,'ns_1@10.6.2.39'},
{gen_server,call,
[{'janitor_agent-saslbucket',
'ns_1@10.6.2.39'},
{if_rebalance,<0.7863.100>,
{update_vbucket_state,918,replica,
undefined,undefined}},
infinity]}}}}]}
- Node 10.6.2.39 gets auto-failed-over, with the following log entry:
Node ('ns_1@10.6.2.39') was automatically failovered.
[down,stale,
{last_heard,{1360,973486,296813}},
{outgoing_replications_safeness_level,
[{"saslbucket",stale},{"default",green}]},
{incoming_replications_conf_hashes,
[{"saslbucket",
[{'ns_1@10.6.2.37',7942524},
{'ns_1@10.6.2.38',130938511},
{'ns_1@10.6.2.40',131674787},
{'ns_1@10.6.2.42',99738451},
{'ns_1@10.6.2.43',85152916},
{'ns_1@10.6.2.44',119351330}]},
{"default",
[{'ns_1@10.6.2.37',80548424},
{'ns_1@10.6.2.38',99434638},
{'ns_1@10.6.2.40',41181054},
{'ns_1@10.6.2.42',75330863},
{'ns_1@10.6.2.43',104165652},
{'ns_1@10.6.2.44',55133429}]}]},
{active_buckets,["saslbucket","default"]},
{ready_buckets,["saslbucket","default"]},
{local_tasks,
[[{pid,<<"<0.15978.88>">>},
{changes_done,519741},
{design_documents,[<<"_design/d1">>]},
{indexer_type,replica},
{initial_build,false},
{progress,100},
{set,<<"default">>},
{signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
{started_on,1360953239},
{total_changes,519741},
{type,indexer},
{updated_on,1360972465}],
[{pid,<<"<0.21626.100>">>},
{changes_done,0},
{design_documents,[<<"_design/d1">>]},
{indexer_type,main},
{initial_build,false},
{progress,0},
{set,<<"default">>},
{signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
{started_on,1360966193},
{total_changes,5784547},
{type,indexer},
{updated_on,1360966193}],
[{pid,<<"<0.20399.103>">>},
{bucket,<<"default">>},
{original_target,{[{type,bucket}]}},
{progress,52},
{started_on,1360973268},
{total_vbuckets,294},
{trigger_type,scheduled},
{type,bucket_compaction},
{updated_on,1360973367},
{vbuckets_done,153}],
[{type,xdcr},
{id,<<"e106ec063395f02c97dbba63a247cfad/saslbucket/saslbucket">>},
{errors,[]},
{changes_left,7131130},
{docs_checked,24571336},
{docs_written,19190871},
{data_replicated,14022432192},
{active_vbreps,32},
{waiting_vbreps,144},
{time_working,2828014},
{time_committing,4754},
{num_checkpoints,2417},
{num_failedckpts,62},
{docs_rep_queue,90071},
{size_rep_queue,9075615}],
[{type,xdcr},
{id,<<"e106ec063395f02c97dbba63a247cfad/default/default">>},
{errors,[]},
{changes_left,3699342},
{docs_checked,26077879},
{docs_written,22400220},
{data_replicated,16439871268},
{active_vbreps,32},
{waiting_vbreps,114},
{time_working,3222330},
{time_committing,5285},
{num_checkpoints,2875},
{num_failedckpts,75},
{docs_rep_queue,38011},
{size_rep_queue,3849309}]]},
{memory,
[{total,1467390560},
{processes,1242860128},
{processes_used,1242396472},
{system,224530432},
{atom,1506713},
{atom_used,1501102},
{binary,137844480},
{code,15673585},
{ets,57752440}]},
{system_memory_data,
[{system_total_memory,32745521152},
{free_swap,2119372800},
{total_swap,5368700928},
{cached_memory,13826760704},
{buffered_memory,14364672},
{free_memory,331780096},
{total_memory,32745521152}]},
{node_storage_conf,[{db_path,"/data"},{index_path,"/index"}]},
{statistics,
[{wall_clock,{169038234,13826}},
{context_switches,{1855663220,0}},
{garbage_collection,{176172403,3879508106118,0}},
{io,{{input,2517880361702},{output,239493307889}}},
{reductions,{558661606055,22017720}},
{run_queue,118},
{runtime,{149380120,7780}}]},
{system_stats,
[{cpu_utilization_rate,50.67178502879079},
{swap_total,5368700928},
{swap_used,3250393088}]},
{interesting_stats,
[{couch_docs_actual_disk_size,70063578847},
{couch_docs_data_size,28333503657},
{couch_views_actual_disk_size,2013908118},
{couch_views_data_size,535661359},
{curr_items,23294623},
{curr_items_tot,46790346},
{mem_used,13864234792},
{vb_replica_curr_items,23495723}]},
{cluster_compatibility_version,131072},
{version,
[{public_key,"0.13"},
{lhttpc,"1.3.0"},
{ale,"8cffe61"},
{os_mon,"2.2.7"},
{couch_set_view,"1.2.0a-c6e7157-git"},
{mnesia,"4.5"},
{inets,"5.7.1"},
{couch,"1.2.0a-c6e7157-git"},
{mapreduce,"1.0.0"},
{couch_index_merger,"1.2.0a-c6e7157-git"},
{kernel,"2.14.5"},
{crypto,"2.0.4"},
{ssl,"4.1.6"},
{sasl,"2.1.10"},
{couch_view_parser,"1.0.0"},
{ns_server,"2.0.1-156-rel-enterprise"},
{mochiweb,"1.4.1"},
{oauth,"7d85d3ef"},
{stdlib,"1.17.5"}]},
{supported_compat_version,[2,0]},
{system_arch,"x86_64-unknown-linux-gnu"},
{wall_clock,169038},
{memory_data,{32745521152,32589651968,{<18650.21626.100>,30409072}}},
{disk_data,
[{"/",49064776,16},
{"/dev/shm",15989024,0},
{"/boot",495844,7},
{"/data",243588516,30},
{"/index",123860788,2}]},
{meminfo,
<<"MemTotal: 31978048 kB\nMemFree: 152876 kB\nBuffers: 14232 kB\nCached: 13468780 kB\nSwapCached: 284200 kB\nActive: 20652152 kB\nInactive: 7798348 kB\nActive(anon): 13935296 kB\nInactive(anon): 1074432 kB\nActive(file): 6716856 kB\nInactive(file): 6723916 kB\nUnevictable: 2596820 kB\nMlocked: 2596912 kB\nSwapTotal: 5242872 kB\nSwapFree: 2068752 kB\nDirty: 4864 kB\nWriteback: 0 kB\nAnonPages: 17289872 kB\nMapped: 45316 kB\nShmem: 0 kB\nSlab: 398332 kB\nSReclaimable: 343232 kB\nSUnreclaim: 55100 kB\nKernelStack: 1560 kB\nPageTables: 43464 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 21231896 kB\nCommitted_AS: 20903176 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 64624 kB\nVmallocChunk: 34359662900 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 0 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 32768000 kB\nDirectMap2M: 0 kB\n">>}]
- No cores on 10.6.2.39
18716 couchbas 20 0 4367m 1.9g 40m S 99.7 6.3 4314:33 beam.smp
19115 couchbas 20 0 17.4g 14g 2616 S 12.6 47.0 714:26.47 memcached
- Unable to generate diagnostic report on the source cluster.
default ~80M items
saslbucket ~65M items
- 1 ongoing replication for each of the buckets
- Mixed load (creates-updates-deletes-expirations) running on both nodes
- 1 Production view under a design doc for default on the source cluster
- Continuously running queries on the view as well
Live cluster: (Uptime >24 hours)
Source: http://10.6.2.37:8091
Destination: http://10.6.2.45:8091
- Rebalance operation running:
Starting rebalance, KeepNodes = ['ns_1@10.6.2.37','ns_1@10.6.2.38',
'ns_1@10.6.2.39','ns_1@10.6.2.40',
'ns_1@10.6.2.42'], EjectNodes = ['ns_1@10.6.2.43',
'ns_1@10.6.2.44']
- Rebalance operation very slow because:
Indexing default/_design/d1
Compacting bucket default
Compacting bucket saslbucket
Compacting bucket saslbucket
While the rebalance was running, saw the following messages at least a couple of times:
"Could not auto-failover node ('ns_1@10.6.2.38'). There was at least another node down."
"Could not automatically failover node 'ns_1@10.6.2.38' because I think rebalance is running"
And then:
- Couchbase server on Node 10.6.2.39 goes down
- Rebalance exits
Rebalance exited with reason {bulk_set_vbucket_state_failed,
[{'ns_1@10.6.2.39',
{'EXIT',
{{nodedown,'ns_1@10.6.2.39'},
{gen_server,call,
[{'janitor_agent-saslbucket',
'ns_1@10.6.2.39'},
{if_rebalance,<0.7863.100>,
{update_vbucket_state,918,replica,
undefined,undefined}},
infinity]}}}}]}
- Node 10.6.2.39 gets auto-failed-over, with the following log entry:
Node ('ns_1@10.6.2.39') was automatically failovered.
[down,stale,
{last_heard,{1360,973486,296813}},
{outgoing_replications_safeness_level,
[{"saslbucket",stale},{"default",green}]},
{incoming_replications_conf_hashes,
[{"saslbucket",
[{'ns_1@10.6.2.37',7942524},
{'ns_1@10.6.2.38',130938511},
{'ns_1@10.6.2.40',131674787},
{'ns_1@10.6.2.42',99738451},
{'ns_1@10.6.2.43',85152916},
{'ns_1@10.6.2.44',119351330}]},
{"default",
[{'ns_1@10.6.2.37',80548424},
{'ns_1@10.6.2.38',99434638},
{'ns_1@10.6.2.40',41181054},
{'ns_1@10.6.2.42',75330863},
{'ns_1@10.6.2.43',104165652},
{'ns_1@10.6.2.44',55133429}]}]},
{active_buckets,["saslbucket","default"]},
{ready_buckets,["saslbucket","default"]},
{local_tasks,
[[{pid,<<"<0.15978.88>">>},
{changes_done,519741},
{design_documents,[<<"_design/d1">>]},
{indexer_type,replica},
{initial_build,false},
{progress,100},
{set,<<"default">>},
{signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
{started_on,1360953239},
{total_changes,519741},
{type,indexer},
{updated_on,1360972465}],
[{pid,<<"<0.21626.100>">>},
{changes_done,0},
{design_documents,[<<"_design/d1">>]},
{indexer_type,main},
{initial_build,false},
{progress,0},
{set,<<"default">>},
{signature,<<"cf1ae2783bd44c07b46c3cac242842a5">>},
{started_on,1360966193},
{total_changes,5784547},
{type,indexer},
{updated_on,1360966193}],
[{pid,<<"<0.20399.103>">>},
{bucket,<<"default">>},
{original_target,{[{type,bucket}]}},
{progress,52},
{started_on,1360973268},
{total_vbuckets,294},
{trigger_type,scheduled},
{type,bucket_compaction},
{updated_on,1360973367},
{vbuckets_done,153}],
[{type,xdcr},
{id,<<"e106ec063395f02c97dbba63a247cfad/saslbucket/saslbucket">>},
{errors,[]},
{changes_left,7131130},
{docs_checked,24571336},
{docs_written,19190871},
{data_replicated,14022432192},
{active_vbreps,32},
{waiting_vbreps,144},
{time_working,2828014},
{time_committing,4754},
{num_checkpoints,2417},
{num_failedckpts,62},
{docs_rep_queue,90071},
{size_rep_queue,9075615}],
[{type,xdcr},
{id,<<"e106ec063395f02c97dbba63a247cfad/default/default">>},
{errors,[]},
{changes_left,3699342},
{docs_checked,26077879},
{docs_written,22400220},
{data_replicated,16439871268},
{active_vbreps,32},
{waiting_vbreps,114},
{time_working,3222330},
{time_committing,5285},
{num_checkpoints,2875},
{num_failedckpts,75},
{docs_rep_queue,38011},
{size_rep_queue,3849309}]]},
{memory,
[{total,1467390560},
{processes,1242860128},
{processes_used,1242396472},
{system,224530432},
{atom,1506713},
{atom_used,1501102},
{binary,137844480},
{code,15673585},
{ets,57752440}]},
{system_memory_data,
[{system_total_memory,32745521152},
{free_swap,2119372800},
{total_swap,5368700928},
{cached_memory,13826760704},
{buffered_memory,14364672},
{free_memory,331780096},
{total_memory,32745521152}]},
{node_storage_conf,[{db_path,"/data"},{index_path,"/index"}]},
{statistics,
[{wall_clock,{169038234,13826}},
{context_switches,{1855663220,0}},
{garbage_collection,{176172403,3879508106118,0}},
{io,{{input,2517880361702},{output,239493307889}}},
{reductions,{558661606055,22017720}},
{run_queue,118},
{runtime,{149380120,7780}}]},
{system_stats,
[{cpu_utilization_rate,50.67178502879079},
{swap_total,5368700928},
{swap_used,3250393088}]},
{interesting_stats,
[{couch_docs_actual_disk_size,70063578847},
{couch_docs_data_size,28333503657},
{couch_views_actual_disk_size,2013908118},
{couch_views_data_size,535661359},
{curr_items,23294623},
{curr_items_tot,46790346},
{mem_used,13864234792},
{vb_replica_curr_items,23495723}]},
{cluster_compatibility_version,131072},
{version,
[{public_key,"0.13"},
{lhttpc,"1.3.0"},
{ale,"8cffe61"},
{os_mon,"2.2.7"},
{couch_set_view,"1.2.0a-c6e7157-git"},
{mnesia,"4.5"},
{inets,"5.7.1"},
{couch,"1.2.0a-c6e7157-git"},
{mapreduce,"1.0.0"},
{couch_index_merger,"1.2.0a-c6e7157-git"},
{kernel,"2.14.5"},
{crypto,"2.0.4"},
{ssl,"4.1.6"},
{sasl,"2.1.10"},
{couch_view_parser,"1.0.0"},
{ns_server,"2.0.1-156-rel-enterprise"},
{mochiweb,"1.4.1"},
{oauth,"7d85d3ef"},
{stdlib,"1.17.5"}]},
{supported_compat_version,[2,0]},
{system_arch,"x86_64-unknown-linux-gnu"},
{wall_clock,169038},
{memory_data,{32745521152,32589651968,{<18650.21626.100>,30409072}}},
{disk_data,
[{"/",49064776,16},
{"/dev/shm",15989024,0},
{"/boot",495844,7},
{"/data",243588516,30},
{"/index",123860788,2}]},
{meminfo,
<<"MemTotal: 31978048 kB\nMemFree: 152876 kB\nBuffers: 14232 kB\nCached: 13468780 kB\nSwapCached: 284200 kB\nActive: 20652152 kB\nInactive: 7798348 kB\nActive(anon): 13935296 kB\nInactive(anon): 1074432 kB\nActive(file): 6716856 kB\nInactive(file): 6723916 kB\nUnevictable: 2596820 kB\nMlocked: 2596912 kB\nSwapTotal: 5242872 kB\nSwapFree: 2068752 kB\nDirty: 4864 kB\nWriteback: 0 kB\nAnonPages: 17289872 kB\nMapped: 45316 kB\nShmem: 0 kB\nSlab: 398332 kB\nSReclaimable: 343232 kB\nSUnreclaim: 55100 kB\nKernelStack: 1560 kB\nPageTables: 43464 kB\nNFS_Unstable: 0 kB\nBounce: 0 kB\nWritebackTmp: 0 kB\nCommitLimit: 21231896 kB\nCommitted_AS: 20903176 kB\nVmallocTotal: 34359738367 kB\nVmallocUsed: 64624 kB\nVmallocChunk: 34359662900 kB\nHardwareCorrupted: 0 kB\nAnonHugePages: 0 kB\nHugePages_Total: 0\nHugePages_Free: 0\nHugePages_Rsvd: 0\nHugePages_Surp: 0\nHugepagesize: 2048 kB\nDirectMap4k: 32768000 kB\nDirectMap2M: 0 kB\n">>}]
- No cores on 10.6.2.39
18716 couchbas 20 0 4367m 1.9g 40m S 99.7 6.3 4314:33 beam.smp
19115 couchbas 20 0 17.4g 14g 2616 S 12.6 47.0 714:26.47 memcached
- Unable to generate diagnostic report on the source cluster.