<!-- 
RSS generated by JIRA (5.2.4#845-sha1:c9f4cc41abe72fb236945343a1f485c2c844dac9) at Sun May 19 23:53:05 CDT 2013

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary add field=key&field=summary to the URL of your request.
For example:
http://www.couchbase.com/issues/si/jira.issueviews:issue-xml/MB-7199/MB-7199.xml?field=key&field=summary
-->
<rss version="0.92" >
<channel>
    <title>Couchbase</title>
    <link>http://www.couchbase.com/issues</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>5.2.4</version>
        <build-number>845</build-number>
        <build-date>26-12-2012</build-date>
    </build-info>

<item>
            <title>[MB-7199] Couchbase server can&apos;t handle hundreds of view queries with unlimited number of results at the same time</title>
                <link>http://www.couchbase.com/issues/browse/MB-7199</link>
                <project id="10010" key="MB">Couchbase Server</project>
                        <description>Cluster: 6 nodes&lt;br/&gt;
10.6.2.37&lt;br/&gt;
10.6.2.38&lt;br/&gt;
10.6.2.39&lt;br/&gt;
10.6.2.40&lt;br/&gt;
10.6.2.42&lt;br/&gt;
10.6.2.43&lt;br/&gt;
&lt;br/&gt;
Build # 2.0.0-1952 with 16 erlang schedulers&lt;br/&gt;
each node with 390GB SSD drive, 32GB RAM&lt;br/&gt;
&lt;br/&gt;
2 buckets created sasl and default. Start loading items with 8K creates per sec to each bucket. Then insert a ddoc with 2 views to each bucket. Then have 4 clients do query for the view with 120 reads per sec. &lt;br/&gt;
&lt;br/&gt;
I don&amp;#39;t put any limit on query results and those queries are generated without waiting for previous ones to finish showing the results:&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;capiUrl = &amp;quot;&lt;a href=&quot;http://%s:%s/couchBase/&quot;&gt;http://%s:%s/couchBase/&lt;/a&gt;&amp;quot; % (cfg.COUCHBASE_IP, cfg.COUCHBASE_PORT)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;url = capiUrl + &amp;#39;%s/_design/%s/_%s/%s&amp;#39; % (bucket,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;design_doc_name, type_,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;view_name)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;headers = {&amp;#39;Content-Type&amp;#39;: &amp;#39;application/json&amp;#39;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;#39;Authorization&amp;#39;: &amp;#39;Basic %s&amp;#39; % authorization,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;#39;Accept&amp;#39;: &amp;#39;*/*&amp;#39;}&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;req = urllib2.Request(url, headers = headers)&lt;br/&gt;
&lt;br/&gt;
Then the UI becomes unresponsive. &lt;br/&gt;
Pay attention to the following stats:&lt;br/&gt;
&lt;br/&gt;
1st is the erlang scheduler on one of the nodes while the queries are running:&lt;br/&gt;
&lt;br/&gt;
(&lt;a href=&apos;mailto:ns_1@10.6.2.37&apos;&gt;ns_1@10.6.2.37&lt;/a&gt;)5&amp;gt; F = fun (R) -&amp;gt; io:format(&amp;quot;~p ~p~n&amp;quot;, [latency:ts(now()), erlang:statistics(run_queues)]), timer:sleep(100), R(R) end.&lt;br/&gt;
#Fun&amp;lt;erl_eval.6.80247286&amp;gt;&lt;br/&gt;
1353032384137 {11,104,2,0,8,11,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384293 {4,65,103,7,2,20,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384425 {3,7,4,25,21,3,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384553 {23,17,50,6,6,0,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384672 {16,28,92,15,65,42,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384795 {6,4,47,15,1,0,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032384919 {1,11,86,59,56,55,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385081 {54,49,30,44,33,11,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385221 {15,47,10,45,9,31,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385355 {46,2,72,89,28,4,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385468 {11,1,8,26,0,2,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385610 {7,23,7,14,20,13,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385765 {7,85,11,16,0,12,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032385905 {9,29,28,2,3,26,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386068 {48,112,142,31,12,25,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386222 {11,40,28,36,5,9,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386356 {64,53,4,5,7,34,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386560 {0,2,45,2,0,89,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386700 {50,18,83,4,0,35,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386837 {0,18,3,2,17,4,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032386984 {2,10,11,6,0,4,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387105 {1,5,12,2,0,64,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387231 {22,67,58,5,19,7,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387337 {17,1,38,33,7,1,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387469 {5,5,48,27,2,18,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387598 {2,50,47,88,41,8,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387746 {2,55,16,35,1,12,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032387897 {3,29,98,0,5,19,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388021 {29,50,147,0,5,3,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388146 {15,3,30,3,46,2,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388277 {53,8,50,1,10,14,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388402 {2,19,45,0,6,2,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388594 {17,123,2,0,29,4,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388734 {35,92,0,3,40,70,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032388873 {2,10,22,5,18,17,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389008 {112,84,15,0,1,0,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389133 {102,57,0,25,3,23,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389257 {44,55,28,5,36,49,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389379 {4,40,3,48,2,48,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389549 {24,161,24,38,16,21,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389686 {54,25,12,23,7,98,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389804 {79,33,20,2,3,46,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032389950 {90,0,25,13,45,56,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
1353032390101 {59,10,17,1,37,54,0,0,0,0,0,0,0,0,0,0}&lt;br/&gt;
&lt;br/&gt;
2nd is the top stats about beam.smp:&lt;br/&gt;
&amp;nbsp;&amp;nbsp;PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                                                                                    &lt;br/&gt;
&amp;nbsp;&amp;nbsp;676 couchbas  20   0 26.0g  24g 5128 S 663.9 77.7 369:01.85 beam.smp     &lt;br/&gt;
&lt;br/&gt;
24G memory usage. And the CPU% is always above 350% </description>
                <environment>centos5.6, build 1952 with 16 scheduler</environment>
            <key id="20798">MB-7199</key>
            <summary>Couchbase server can&apos;t handle hundreds of view queries with unlimited number of results at the same time</summary>
                <type id="1" iconUrl="http://www.couchbase.com/issues/images/icons/issuetypes/bug.png">Bug</type>
                                <priority id="3" iconUrl="http://www.couchbase.com/issues/images/icons/priorities/major.png">Major</priority>
                    <status id="1" iconUrl="http://www.couchbase.com/issues/images/icons/statuses/open.png">Open</status>
                    <resolution id="-1">Unresolved</resolution>
                    <security id="10011">Public</security>
                        <assignee username="Aliaksey Artamonau">Aliaksey Artamonau</assignee>
                                <reporter username="Chisheng">Chisheng Hong</reporter>
                        <labels>
                        <label>2.0-release-notes</label>
                        <label>system-test</label>
                    </labels>
                <created>Thu, 15 Nov 2012 22:21:10 -0600</created>
                <updated>Wed, 27 Mar 2013 17:00:45 -0500</updated>
                                    <version>2.0</version>
                <version>2.0.1</version>
                <version>2.0.2</version>
                                <fixVersion>2.1</fixVersion>
                                <component>ns_server</component>
                <component>view-engine</component>
                                <votes>0</votes>
                        <watches>3</watches>
                                                    <comments>
                    <comment id="44175" author="Chisheng" created="Thu, 15 Nov 2012 22:42:34 -0600"  >Do cbcollect_info for every node:&lt;br/&gt;
&lt;a href=&quot;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.38.zip&quot;&gt;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.38.zip&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.39.zip&quot;&gt;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.39.zip&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.40.zip&quot;&gt;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.40.zip&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.42.zip&quot;&gt;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.42.zip&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.43.zip&quot;&gt;https://s3.amazonaws.com/bugdb/jira/MB-7199/11-15/10.6.2.43.zip&lt;/a&gt;&lt;br/&gt;
&lt;br/&gt;
</comment>
                    <comment id="44214" author="tommie" created="Fri, 16 Nov 2012 15:15:04 -0600"  >Same issue on different cluster with ssd at build 1952:&lt;br/&gt;
Went from 35k op/sec kv workload, to 2k op/sec and unresponsive ui when applied 50 queries per second.  Only querying a subset of documents (about 800k)&lt;br/&gt;
Load ratio: -create 10 --get 75 --update 15 --delete 1 &lt;br/&gt;
&lt;br/&gt;
In attached screen see direct correlation between op/sec and view reads.&lt;br/&gt;
&lt;br/&gt;
In atop beam.smp is 5GB and seems to grow without bounds. To replay samples ssh &lt;a href=&apos;mailto:root@10.6.2.66&apos;&gt;root@10.6.2.66&lt;/a&gt;   (pwd: couchbase)&lt;br/&gt;
atop -b 12:40 -r /tmp/atop-nodeMB-7199.log&lt;br/&gt;
&lt;br/&gt;
Also In a previous run beam needed to be killed by OS&lt;br/&gt;
&amp;gt; Nov 15 10:11:48 pine-11803 kernel: Out of memory: Kill process 32588 (beam.smp) score 962 or sacrifice child&lt;br/&gt;
</comment>
                    <comment id="44215" author="alkondratenko" created="Fri, 16 Nov 2012 16:10:03 -0600"  >Folks, please try testing with default settings. Let&amp;#39;s confirm it&amp;#39;s not just code regression</comment>
                    <comment id="44219" author="alkondratenko" created="Fri, 16 Nov 2012 16:47:48 -0600"  >Passing this formally to Aliaksey who is handling this anyways.&lt;br/&gt;
&lt;br/&gt;
We&amp;#39;ve started looking at partially ready crash dump and there are signs of real leak of lots of binaries in index merger</comment>
                    <comment id="44247" author="thuan" created="Fri, 16 Nov 2012 19:55:35 -0600"  >Integrated in github-couchdb-preview #538 (See [&lt;a href=&quot;http://qa.hq.northscale.net/job/github-couchdb-preview/538/&quot;&gt;http://qa.hq.northscale.net/job/github-couchdb-preview/538/&lt;/a&gt;])&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;a href=&quot;http://www.couchbase.com/issues/browse/MB-7199&quot; title=&quot;Couchbase server can&amp;#39;t handle hundreds of view queries with unlimited number of results at the same time&quot;&gt;MB-7199&lt;/a&gt; Don&amp;#39;t buffer socket data on cleanup (Revision 31560d74c3bbe8c019186923a9db3468a8197ab8)&lt;br/&gt;
&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Result = SUCCESS&lt;br/&gt;
Farshid Ghods : &lt;br/&gt;
Files : &lt;br/&gt;
* src/couch_index_merger/src/couch_index_merger.erl&lt;br/&gt;
</comment>
                    <comment id="44250" author="Aliaksey Artamonau" created="Fri, 16 Nov 2012 20:54:51 -0600"  >We would like to know as many details as possible about the queries that you run. In particular, which query parameters are used, and whether you read the response completely or just drop the connection on the floor.</comment>
                    <comment id="44251" author="Chisheng" created="Fri, 16 Nov 2012 21:28:09 -0600"  >I still face the same situation with default 128 scheduler with the same scenario.</comment>
                    <comment id="44328" author="steve" created="Mon, 19 Nov 2012 13:21:37 -0600"  >bug-scrub - want same test on build 1953 (which has new fix) - look for crashes/cores</comment>
                    <comment id="44329" author="farshid" created="Mon, 19 Nov 2012 13:21:42 -0600"  >1- run the same test without changing the limit to N - new build with same code , after cluster has same number of items , same number of nodes&lt;br/&gt;
2- what is the timeout for that call ?&lt;br/&gt;
3- see if it still crashes</comment>
                    <comment id="44347" author="Chisheng" created="Mon, 19 Nov 2012 16:28:33 -0600"  >I ran the same test case in bug description with 1953 (16:16) using mcsoda and do query by using os.system(&amp;quot;curl -X GET &amp;#39;&lt;a href=&quot;http://%s%s:8092/%s/_design/d1/_view/v1?stale=ok&quot;&gt;http://%s%s:8092/%s/_design/d1/_view/v1?stale=ok&lt;/a&gt;&amp;#39;&amp;quot; % (bucket_info, vm_ips[0], bucket)). Same thing happened after I do this unlimited results query more than 1K times (I kick off those queries without caring about whether the previous query finishes showing the results). Memory usage for beam.smp increases above 2.5G very easily. But if we put limit for query, this will not happen.</comment>
                    <comment id="44348" author="FilipeManana" created="Mon, 19 Nov 2012 16:58:21 -0600"  >Where&amp;#39;s the crashdump file?</comment>
                    <comment id="44374" author="FilipeManana" created="Mon, 19 Nov 2012 20:13:56 -0600"  >My findings after analyzing crash dump file (~300Mb):&lt;br/&gt;
&lt;br/&gt;
When the crash dump was produced, the memory used by Erlang VM was about:&lt;br/&gt;
&lt;br/&gt;
$ grep -n OldHeap: erl_crash.dump.11-19-2012-13\:16\:17.5856 | cut -d &amp;#39; &amp;#39; -f 2 | perl -ne &amp;#39;$sum = $sum + $_; print $sum, &amp;quot;\n&amp;quot;;&amp;#39; | tail -n 1&lt;br/&gt;
152453922&lt;br/&gt;
&lt;br/&gt;
152453922 * 8 = ~1.2Gb&lt;br/&gt;
&lt;br/&gt;
$ grep -n Stack+heap: erl_crash.dump.11-19-2012-13\:16\:17.5856 | cut -d &amp;#39; &amp;#39; -f 2 | perl -ne &amp;#39;$sum = $sum + $_; print $sum, &amp;quot;\n&amp;quot;;&amp;#39; | tail -n 1&lt;br/&gt;
53623918&lt;br/&gt;
&lt;br/&gt;
53623918 * 8 = ~428Mb&lt;br/&gt;
&lt;br/&gt;
Process (Erlang process) with biggest old heaps:&lt;br/&gt;
&lt;br/&gt;
$ grep -n OldHeap: erl_crash.dump.11-19-2012-13\:16\:17.5856 | sort -k 3 -n -t:&lt;br/&gt;
(...)&lt;br/&gt;
23058:OldHeap: 2629425&lt;br/&gt;
10824:OldHeap: 3286780&lt;br/&gt;
227045:OldHeap: 8024355&lt;br/&gt;
&lt;br/&gt;
(values are in words, therefore to get byte values multiply by 8)&lt;br/&gt;
&lt;br/&gt;
These 3 processes are:&lt;br/&gt;
&lt;br/&gt;
Mochiweb server:&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.17887.7&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.1003.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 15:51:37 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [#Port&amp;lt;0.218061&amp;gt;, &amp;lt;0.1003.0&amp;gt;]&lt;br/&gt;
Reductions: 3278012&lt;br/&gt;
Stack+heap: 1346269&lt;br/&gt;
OldHeap: 8024355&lt;br/&gt;
Heap unused: 192102&lt;br/&gt;
OldHeap unused: 6083229&lt;br/&gt;
Program counter: 0x00007f6829c7b928 (mochiweb_http:request/2 + 88)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
ns_server stats reader for bucket default&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.5381.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: &amp;#39;stats_reader-default&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.5353.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 13:30:16 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [&amp;lt;0.5353.0&amp;gt;]&lt;br/&gt;
Reductions: 10535894&lt;br/&gt;
Stack+heap: 1682835&lt;br/&gt;
OldHeap: 3286780&lt;br/&gt;
Heap unused: 125117&lt;br/&gt;
OldHeap unused: 2875168&lt;br/&gt;
Program counter: 0x00007f683c076840 (gen_server:loop/6 + 256)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
ns_server stats reader for bucket saslbucket&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.7758.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: &amp;#39;stats_reader-saslbucket&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.7730.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 13:30:40 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [&amp;lt;0.7730.0&amp;gt;]&lt;br/&gt;
Reductions: 5999001&lt;br/&gt;
Stack+heap: 1346269&lt;br/&gt;
OldHeap: 2629425&lt;br/&gt;
Heap unused: 392854&lt;br/&gt;
OldHeap unused: 2629326&lt;br/&gt;
Program counter: 0x00007f683c076840 (gen_server:loop/6 + 256)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
Erlang processes with biggest stack+heap:&lt;br/&gt;
&lt;br/&gt;
$ grep -n Stack+heap: erl_crash.dump.11-19-2012-13\:16\:17.5856 | sort -k 3 -n -t:&lt;br/&gt;
(....)&lt;br/&gt;
227044:Stack+heap: 1346269&lt;br/&gt;
23057:Stack+heap: 1346269&lt;br/&gt;
10823:Stack+heap: 1682835&lt;br/&gt;
&lt;br/&gt;
ns_server&amp;#39;s stats reader for default bucket:&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.5381.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: &amp;#39;stats_reader-default&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.5353.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 13:30:16 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [&amp;lt;0.5353.0&amp;gt;]&lt;br/&gt;
Reductions: 10535894&lt;br/&gt;
Stack+heap: 1682835&lt;br/&gt;
OldHeap: 3286780&lt;br/&gt;
Heap unused: 125117&lt;br/&gt;
OldHeap unused: 2875168&lt;br/&gt;
Program counter: 0x00007f683c076840 (gen_server:loop/6 + 256)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
ns_server&amp;#39;s stats reader for saslbucket:&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.7758.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: &amp;#39;stats_reader-saslbucket&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.7730.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 13:30:40 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [&amp;lt;0.7730.0&amp;gt;]&lt;br/&gt;
Reductions: 5999001&lt;br/&gt;
Stack+heap: 1346269&lt;br/&gt;
OldHeap: 2629425&lt;br/&gt;
Heap unused: 392854&lt;br/&gt;
OldHeap unused: 2629326&lt;br/&gt;
Program counter: 0x00007f683c076840 (gen_server:loop/6 + 256)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
Mochiweb&amp;#39;s server:&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.17887.7&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.1003.0&amp;gt;&lt;br/&gt;
Started: Mon Nov 19 15:51:37 2012&lt;br/&gt;
Message queue length: 0&lt;br/&gt;
Number of heap fragments: 0&lt;br/&gt;
Heap fragment data: 0&lt;br/&gt;
Link list: [#Port&amp;lt;0.218061&amp;gt;, &amp;lt;0.1003.0&amp;gt;]&lt;br/&gt;
Reductions: 3278012&lt;br/&gt;
Stack+heap: 1346269&lt;br/&gt;
OldHeap: 8024355&lt;br/&gt;
Heap unused: 192102&lt;br/&gt;
OldHeap unused: 6083229&lt;br/&gt;
Program counter: 0x00007f6829c7b928 (mochiweb_http:request/2 + 88)&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
Seems garbage collector isn&amp;#39;t doing its job well. View related processes don&amp;#39;t seem to be using big quantities of memory (nor the connection pool itself).&lt;br/&gt;
&lt;br/&gt;
Finally looking at error_logger process message box, there&amp;#39;s a ton of EBADF errors on file, socket and port operations:&lt;br/&gt;
&lt;br/&gt;
E.g.:&lt;br/&gt;
&lt;br/&gt;
{notify,{error,noproc,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{emulator,&amp;quot;~s~n&amp;quot;,[&amp;quot;erts_poll_wait() failed: ebadf (9)\n&amp;quot;]}}}&lt;br/&gt;
&lt;br/&gt;
{notify,{error,noproc,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{emulator,&amp;quot;~s~n&amp;quot;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[&amp;quot;Bad output fd in erts_poll()! fd=42, port=#Port&amp;lt;0.7463&amp;gt;, driver=spawn, name=/opt/couchbase/bin/sigar_port\n&amp;quot;]}}}&lt;br/&gt;
&lt;br/&gt;
{notify,{error,noproc,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{emulator,&amp;quot;~s~n&amp;quot;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[&amp;quot;Bad input fd in erts_poll()! fd=78, port=#Port&amp;lt;0.9684&amp;gt;, driver=tcp_inet, name=tcp_inet\n&amp;quot;]}}}&lt;br/&gt;
&lt;br/&gt;
{notify,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{error_report,&amp;lt;0.56.0&amp;gt;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{&amp;lt;0.32244.4&amp;gt;,crash_report,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[[{initial_call,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{couch_file,spawn_writer,[&amp;#39;Argument__1&amp;#39;,&amp;#39;Argument__2&amp;#39;]}},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{pid,&amp;lt;0.32244.4&amp;gt;},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{registered_name,[]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{error_info,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{error,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{badmatch,{error,ebadf}},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{couch_file,write_blocks,3},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{couch_file,writer_collect_chunks,5},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{proc_lib,init_p_do_apply,3}]}},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{ancestors,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[&amp;lt;0.32241.4&amp;gt;,&amp;lt;0.14053.0&amp;gt;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;lt;0.14044.0&amp;gt;,&amp;lt;0.14043.0&amp;gt;]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{messages,[]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{links,[&amp;lt;0.32241.4&amp;gt;,#Port&amp;lt;0.163328&amp;gt;]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{dictionary,[]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{trap_exit,true},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{status,running},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{heap_size,610},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{stack_size,24},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{reductions,49502568}],&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[]]}}}&lt;br/&gt;
&lt;br/&gt;
{notify,{error,&amp;lt;0.56.0&amp;gt;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{emulator,&amp;quot;~s~n&amp;quot;,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[&amp;quot;Error in process &amp;lt;0.4736.7&amp;gt; on node &amp;#39;&lt;a href=&apos;mailto:ns_1@10.6.2.37&apos;&gt;ns_1@10.6.2.37&lt;/a&gt;&amp;#39; with exit value: {{badmatch,{error,ebadf}},[{couch_btree,get_node,2},{couch_btree,modify_node,8},{couch_btree,modify_kpnode,10},{couch_btree,modify_node,8},{couch_btree,modify_kpnode,10},{couch_btree,modify_node,8},{couch_btree,modify_kpnode... \n&amp;quot;]}}}&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
The view merger&amp;#39;s connection pool has a max size of 10 000 connections, but at the crash dump time, it only had 2684 connections open (I should probably decrease the limit to 5K or less).&lt;br/&gt;
&lt;br/&gt;
At startup, couchbase-server.sh sets ulimit -n to 10240 and doesn&amp;#39;t set ERL_MAX_PORTS, which means it gets the default of 1024 (according to erl -man erlang).&lt;br/&gt;
Perhaps we&amp;#39;re reaching a limit of ports (each raw file descriptor opened uses 1 port). However this still doesn&amp;#39;t explain EBADF errors, I would expect EMFILE instead.&lt;br/&gt;
&lt;br/&gt;
Alk, any reason to not set ERL_MAX_PORTS to a higher value?&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
</comment>
                    <comment id="44375" author="FilipeManana" created="Mon, 19 Nov 2012 20:14:15 -0600"  >Alk, see question above.</comment>
                    <comment id="44377" author="FilipeManana" created="Mon, 19 Nov 2012 20:36:25 -0600"  >After talking with Alk, ERL_MAX_PORTS has no effect on Unixes apparently, as it assumes the maximum allowed in the system (getting it via sysconf, etc).&lt;br/&gt;
&lt;br/&gt;
I think it&amp;#39;s worth re-trying this test with +A 16 instead of +S N:N.&lt;br/&gt;
&lt;br/&gt;
Chisheng can you try it?&lt;br/&gt;
Also, please provide the server logs next time the crash happens.&lt;br/&gt;
&lt;br/&gt;
thanks</comment>
                    <comment id="44447" author="steve" created="Tue, 20 Nov 2012 13:28:46 -0600"  >moved per bug-scrub to 2.0.1</comment>
                    <comment id="45003" author="FilipeManana" created="Thu, 29 Nov 2012 12:06:15 -0600"  >Now that I am back at home, I tried this on a 4-node cluster with my hardware.&lt;br/&gt;
In a 4-node cluster, I created a view with 10M rows, and then spawned 5000 curl processes like this:&lt;br/&gt;
&lt;br/&gt;
$ for i in `seq 1 5000`; do curl -s &lt;a href=&quot;http://localhost:9500/default/_design/test/_view/view1&quot;&gt;http://localhost:9500/default/_design/test/_view/view1&lt;/a&gt; &amp;amp;  done&lt;br/&gt;
&lt;br/&gt;
After a short period, all view queries start returning an error, because there&amp;#39;s a ns_config call timeout&lt;br/&gt;
in the HTTP handler before query execution happens in view engine:&lt;br/&gt;
&lt;br/&gt;
[couchdb:error,2012-11-29T17:16:34.030,&lt;a href=&apos;mailto:n_0@192.168.1.88&apos;&gt;n_0@192.168.1.88&lt;/a&gt;:&amp;lt;0.2954.1&amp;gt;:couch_log:error:42]Uncaught error in HTTP request: {exit,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{timeout,{gen_server,call,[ns_config,get]}}}&lt;br/&gt;
&lt;br/&gt;
Stacktrace: [{diag_handler,diagnosing_timeouts,1,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;src/diag_handler.erl&amp;quot;},{line,375}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{menelaus_auth,check_auth,1,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;src/menelaus_auth.erl&amp;quot;},{line,121}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{menelaus_auth,bucket_auth_fun,1,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;src/menelaus_auth.erl&amp;quot;},{line,168}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{menelaus_auth,is_bucket_accessible,2,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;src/menelaus_auth.erl&amp;quot;},{line,68}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{capi_frontend,do_db_req,2,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;src/capi_frontend.erl&amp;quot;},{line,53}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{httpd,handle_request,6,&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;[{file,&amp;quot;couch_httpd.erl&amp;quot;},{line,222}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{mochiweb_http,headers,5,[{file,&amp;quot;mochiweb_http.erl&amp;quot;},{line,136}]},&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;{proc_lib,init_p_do_apply,3,[{file,&amp;quot;proc_lib.erl&amp;quot;},{line,227}]}]&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
This seems to cause timeout_diag_logger to start dumping lots of stuff.&lt;br/&gt;
At this point I got the erlang node which received the query, to use over 3Gb of memory, at which point I killed it to get&lt;br/&gt;
an erl_crash.dump (attached here).&lt;br/&gt;
&lt;br/&gt;
Looking at that crash dump, it seems the processes which are using very high amounts of memory are mostly logger processes&lt;br/&gt;
and a few other ns_server processes:&lt;br/&gt;
&lt;br/&gt;
$ IFS=&amp;quot;\n&amp;quot;; for m in `grep &amp;#39;OldHeap:&amp;#39; erl_crash.dump | sort -n -k 2 | tail -n 15`; do egrep $m -A 3 -B 12 erl_crash.dump | grep &amp;#39;^Name&amp;#39; ; done&lt;br/&gt;
Name: standard_error&lt;br/&gt;
Name: &amp;#39;sink-disk_default&amp;#39;&lt;br/&gt;
Name: &amp;#39;sink-disk_error&amp;#39;&lt;br/&gt;
Name: &amp;#39;sink-disk_couchdb&amp;#39;&lt;br/&gt;
Name: &amp;#39;sink-disk_debug&amp;#39;&lt;br/&gt;
Name: &amp;#39;sink-stderr&amp;#39;&lt;br/&gt;
Name: timeout_diag_logger&lt;br/&gt;
Name: ns_config_events&lt;br/&gt;
Name: ns_config&lt;br/&gt;
Name: ns_port_memcached&lt;br/&gt;
Name: &amp;#39;stats_reader-default&amp;#39;&lt;br/&gt;
&lt;br/&gt;
Server logs attached here.&lt;br/&gt;
Haven&amp;#39;t experienced the EBADF errors however (which happened in the QE cluster), maybe because my max open files limit is likely&lt;br/&gt;
higher than in those machines (59000).&lt;br/&gt;
&lt;br/&gt;
Aliaksey, do you think something needs to be tweaked in the logger?&lt;br/&gt;
Thanks&lt;br/&gt;
</comment>
                    <comment id="45004" author="FilipeManana" created="Thu, 29 Nov 2012 12:07:00 -0600"  >Aliaksey, see comment above.&lt;br/&gt;
Thanks</comment>
                    <comment id="45011" author="Aliaksey Artamonau" created="Thu, 29 Nov 2012 13:11:52 -0600"  >We considered hibernating logger processes. But it&amp;#39;s not clear what the performance impact would be. From what you described it seems that the main problem is again those random timeouts.</comment>
                    <comment id="45023" author="FilipeManana" created="Thu, 29 Nov 2012 13:47:51 -0600"  >Agreed, the main cause is the ns_config call timeouts in the ns_server view query HTTP handler.&lt;br/&gt;
With so many timeouts, the logger processes, and mb_master, are the processes with the biggest&lt;br/&gt;
message queue lengths:&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.167.0&amp;gt;&lt;br/&gt;
State: Scheduled&lt;br/&gt;
Name: &amp;#39;sink-stderr&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.33.0&amp;gt;&lt;br/&gt;
Started: Thu Nov 29 17:07:15 2012&lt;br/&gt;
Message queue length: 3431&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.78.0&amp;gt;&lt;br/&gt;
State: Scheduled&lt;br/&gt;
Name: &amp;#39;sink-disk_debug&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.33.0&amp;gt;&lt;br/&gt;
Started: Thu Nov 29 17:07:15 2012&lt;br/&gt;
Message queue length: 405&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.66.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: &amp;#39;sink-disk_error&amp;#39;&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.33.0&amp;gt;&lt;br/&gt;
Started: Thu Nov 29 17:07:15 2012&lt;br/&gt;
Message queue length: 186&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.787.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Name: mb_master&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.371.0&amp;gt;&lt;br/&gt;
Started: Thu Nov 29 17:07:31 2012&lt;br/&gt;
Message queue length: 149&lt;br/&gt;
&lt;br/&gt;
=proc:&amp;lt;0.627.0&amp;gt;&lt;br/&gt;
State: Waiting&lt;br/&gt;
Spawned as: proc_lib:init_p/5&lt;br/&gt;
Spawned by: &amp;lt;0.580.0&amp;gt;&lt;br/&gt;
Started: Thu Nov 29 17:07:30 2012&lt;br/&gt;
Message queue length: 84&lt;br/&gt;
</comment>
                    <comment id="45527" author="kzeller" created="Wed, 5 Dec 2012 17:59:58 -0600"  >Added to RN as:   Be aware that if you attempt hundreds of simultaneous queries with an unlimited &lt;br/&gt;
&amp;nbsp;&amp;nbsp;number of results, Couchbase Server may fail. For instance, 10 million &lt;br/&gt;
&amp;nbsp;&amp;nbsp;results queried simultaneously will cause the server to fail. Instead you should &lt;br/&gt;
&amp;nbsp;&amp;nbsp;specify a reasonable limit of results when you query, otherwise the &lt;br/&gt;
&amp;nbsp;&amp;nbsp;server will stall and crash due to excessive memory usage.</comment>
                    <comment id="53686" author="maria" created="Wed, 27 Mar 2013 17:00:45 -0500"  >moving to 2.1&lt;br/&gt;
</comment>
                </comments>
                    <attachments>
                    <attachment id="15916" name="erl_crash.dump.tgz" size="161047" author="FilipeManana" created="Thu, 29 Nov 2012 12:06:15 -0600" />
                    <attachment id="15917" name="logs.tgz" size="3388502" author="FilipeManana" created="Thu, 29 Nov 2012 12:06:15 -0600" />
                    <attachment id="15835" name="queries2.png" size="76649" author="tommie" created="Fri, 16 Nov 2012 15:15:47 -0600" />
                </attachments>
            <subtasks>
        </subtasks>
                <customfields>
                                                                        <customfield id="customfield_10180" key="com.atlassian.jira.ext.charting:firstresponsedate">
                <customfieldname>Date of First Response</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>Fri, 16 Nov 2012 15:15:04 -0600</customfieldvalue>

                </customfieldvalues>
            </customfield>
                                                                                                                                                                                                            <customfield id="customfield_10081" key="com.pyxis.greenhopper.jira:gh-global-rank">
                <customfieldname>Rank</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>3072</customfieldvalue>
                </customfieldvalues>
            </customfield>
                                                                                                                                                <customfield id="customfield_10052" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                <customfieldname>Sprint Status</customfieldname>
                <customfieldvalues>
                        <customfieldvalue key="10027"><![CDATA[Current Sprint]]></customfieldvalue>

                </customfieldvalues>
            </customfield>
                                                                                        </customfields>
    </item>
</channel>
</rss>