Testing changes to consensus
[mymsc.git] / bibliography.bib
blobea28401b7b1970ff64e8144f067b1dfaab497a55
% THIS IS A COPY, DO NOT EDIT IT!!
% REFER TO ORIGINAL /home/rodrigo/Dropbox/papers//bibliography.bib
% Aguilera, Chen & Toueg, chapter in LNCS 1499 (1998).
% NOTE(review): the page range 497--497 is what the publisher export produced and looks wrong -- verify against the volume.
@incollection{springerlink:aguilera98,
  author      = {Aguilera, Marcos Kawazoe and Chen, Wei and Toueg, Sam},
  affiliation = {Cornell University, Computer Science Department, Ithaca NY 14853-7501, USA {aguilera,weichen,sam}@cs.cornell.edu},
  title       = {Failure Detection and Consensus in the Crash-Recovery Model},
  booktitle   = {Distributed Computing},
  series      = {Lecture Notes in Computer Science},
  editor      = {Kutten, Shay},
  publisher   = {Springer Berlin / Heidelberg},
  pages       = {497--497},
  volume      = {1499},
  year        = {1998},
}
% Vieira, Zwaenepoel & Buzato, DSN 2009 (EPFL Infoscience export; the non-standard fields are kept as annotations).
@inproceedings{labos2009:viera09,
  author      = {Vieira, Gustavo M. D. and Zwaenepoel, Willy and Buzato, Luiz E.},
  title       = {Dynamic {Content} {Web} {Applications}: {Crash}, {Failover}, and {Recovery} {Analysis}},
  booktitle   = {Proceedings of the 39th {International} {Conference} on {Dependable} {Systems} and {Networks} ({DSN})},
  address     = {Estoril, Lisbon, Portugal},
  location    = {Estoril, Lisbon, Portugal},
  year        = {2009},
  url         = {http://www.dsn.org/},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/135193},
  documenturl = {http://infoscience.epfl.ch/getfile.py?recid=135193&mode=best},
  oai-id      = {oai:infoscience.epfl.ch:135193},
  oai-set     = {conf; fulltext-public; fulltext},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {LABOS},
}
% Chandra, Hadzilacos & Toueg, J. ACM 43(4), 1996.
@article{Chandra:1996:WFD:234533.234549,
  author    = {Chandra, Tushar Deepak and Hadzilacos, Vassos and Toueg, Sam},
  title     = {The weakest failure detector for solving consensus},
  journal   = {J. ACM},
  volume    = {43},
  number    = {4},
  month     = jul,
  year      = {1996},
  issn      = {0004-5411},
  pages     = {685--722},
  numpages  = {38},
  url       = {http://doi.acm.org/10.1145/234533.234549},
  doi       = {10.1145/234533.234549},
  acmid     = {234549},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Byzantine Generals' problem, agreement problem, asynchronous systems, atomic broadcast, commit problem, consensus problem, crash failures, failure detection, fault-tolerance, message passing, partial synchrony, processor failures},
}
% Aguilera, Delporte-Gallet, Fauconnier & Toueg, DISC 2001.
@inproceedings{Aguilera:2001:SLE:645958.676119,
  author    = {Aguilera, Marcos Kawazoe and Delporte-Gallet, Carole and Fauconnier, Hugues and Toueg, Sam},
  title     = {Stable Leader Election},
  booktitle = {Proceedings of the 15th International Conference on Distributed Computing},
  series    = {DISC '01},
  year      = {2001},
  isbn      = {3-540-42605-1},
  pages     = {108--122},
  numpages  = {15},
  url       = {http://portal.acm.org/citation.cfm?id=645958.676119},
  acmid     = {676119},
  publisher = {Springer-Verlag},
  address   = {London, UK},
}
% Vieira & Buzato, Brazilian Symposium on Computer Networks and Distributed Systems, 2008.
@inproceedings{vieira08:_trepl,
  author    = {Vieira, Gustavo M. D. and Buzato, Luiz E.},
  title     = {Treplica: {Ubiquitous} {Replication}},
  booktitle = {Proceedings of the 26th Brazilian Symposium on Computer Networks and Distributed Systems},
  year      = {2008},
}
% Aguilera, Merchant, Shah, Veitch & Karamanolis, SOSP 2007.
@inproceedings{Aguilera:2007:SNP:1294261.1294278,
  author    = {Aguilera, Marcos K. and Merchant, Arif and Shah, Mehul and Veitch, Alistair and Karamanolis, Christos},
  title     = {Sinfonia: a new paradigm for building scalable distributed systems},
  booktitle = {Proceedings of twenty-first ACM SIGOPS symposium on Operating systems principles},
  series    = {SOSP '07},
  year      = {2007},
  isbn      = {978-1-59593-591-5},
  location  = {Stevenson, Washington, USA},
  pages     = {159--174},
  numpages  = {16},
  url       = {http://doi.acm.org/10.1145/1294261.1294278},
  doi       = {10.1145/1294261.1294278},
  acmid     = {1294278},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {distributed systems, fault tolerance, scalability, shared memory, transactions, two-phase commit},
}
% Lamport, Commun. ACM 21(7), 1978.
@article{Lamport:1978_clocks,
  author    = {Lamport, Leslie},
  title     = {Time, clocks, and the ordering of events in a distributed system},
  journal   = {Commun. ACM},
  volume    = {21},
  number    = {7},
  month     = jul,
  year      = {1978},
  issn      = {0001-0782},
  pages     = {558--565},
  numpages  = {8},
  url       = {http://doi.acm.org/10.1145/359545.359563},
  doi       = {10.1145/359545.359563},
  acmid     = {359563},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {clock synchronization, computer networks, distributed systems, multiprocess systems},
}
% Isard, Budiu, Yu, Birrell & Fetterly, EuroSys 2007.
@inproceedings{Isard:2007:DDD:1272996.1273005,
  author    = {Isard, Michael and Budiu, Mihai and Yu, Yuan and Birrell, Andrew and Fetterly, Dennis},
  title     = {Dryad: distributed data-parallel programs from sequential building blocks},
  booktitle = {Proceedings of the 2nd ACM SIGOPS/EuroSys European Conference on Computer Systems 2007},
  series    = {EuroSys '07},
  year      = {2007},
  isbn      = {978-1-59593-636-3},
  location  = {Lisbon, Portugal},
  pages     = {59--72},
  numpages  = {14},
  url       = {http://doi.acm.org/10.1145/1272996.1273005},
  doi       = {10.1145/1272996.1273005},
  acmid     = {1273005},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {cluster computing, concurrency, dataflow, distributed programming},
}
% Defago, Schiper & Urban, ACM Comput. Surv. 36(4), 2004.
@article{Defago:2004,
  author    = {D{\'e}fago, Xavier and Schiper, Andr{\'e} and Urb{\'a}n, P{\'e}ter},
  title     = {Total order broadcast and multicast algorithms: Taxonomy and survey},
  journal   = {ACM Comput. Surv.},
  volume    = {36},
  number    = {4},
  month     = dec,
  year      = {2004},
  issn      = {0360-0300},
  pages     = {372--421},
  numpages  = {50},
  url       = {http://doi.acm.org/10.1145/1041680.1041682},
  doi       = {10.1145/1041680.1041682},
  acmid     = {1041682},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Distributed systems, agreement problems, atomic broadcast, atomic multicast, classification, distributed algorithms, fault-tolerance, global ordering, group communication, message passing, survey, taxonomy, total ordering},
}
% Hadzilacos & Toueg, Cornell technical report, May 1994.
% NOTE(review): the report identifier was stored only in `key`, which styles do not print; it is now also in `number`.
@techreport{hadzilacos94,
  author      = {Hadzilacos, Vassos and Toueg, Sam},
  title       = {A modular approach to the specification and implementation of fault-tolerant broadcasts},
  institution = {Department of Computer Science, Cornell University},
  number      = {TR94-1425},
  key         = {TR94-1425},
  address     = {Ithaca, NY},
  month       = may,
  year        = {1994},
}
% Rodrigues & Raynal, ICDCS 2000.
% NOTE(review): the page range is open-ended in the export (only the first page is known here).
@inproceedings{Rodrigues:2000,
  author    = {Rodrigues, L. and Raynal, M.},
  title     = {Atomic Broadcast in Asynchronous Crash-Recovery Distributed Systems},
  booktitle = {Proceedings of the 20th International Conference on Distributed Computing Systems ({ICDCS} 2000)},
  series    = {ICDCS '00},
  year      = {2000},
  isbn      = {0-7695-0601-1},
  pages     = {288--},
  url       = {http://portal.acm.org/citation.cfm?id=850927.851790},
  acmid     = {851790},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  keywords  = {Distributed Algorithms, Distributed Fault Tolerant Systems, Communication Protocols},
}
% Chandra & Toueg, J. ACM 43(2), 1996.
@article{Chandra:1996:UFD:226643.226647,
  author    = {Chandra, Tushar Deepak and Toueg, Sam},
  title     = {Unreliable failure detectors for reliable distributed systems},
  journal   = {J. ACM},
  volume    = {43},
  number    = {2},
  month     = mar,
  year      = {1996},
  issn      = {0004-5411},
  pages     = {225--267},
  numpages  = {43},
  url       = {http://doi.acm.org/10.1145/226643.226647},
  doi       = {10.1145/226643.226647},
  acmid     = {226647},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Byzantine Generals' problem, agreement problem, asynchronous systems, atomic broadcast, commit problem, consensus problem, crash failures, failure detection, fault-tolerance, message passing, partial synchrony, processor failures},
}
% Schroeder & Gibson, USENIX FAST 2007 (identified by article number; no page range in the export).
@inproceedings{Schroeder:2007:DFR:1267903.1267904,
  author    = {Schroeder, Bianca and Gibson, Garth A.},
  title     = {Disk failures in the real world: what does an {MTTF} of 1,000,000 hours mean to you?},
  booktitle = {Proceedings of the 5th USENIX conference on File and Storage Technologies},
  year      = {2007},
  location  = {San Jose, CA},
  articleno = {1},
  url       = {http://portal.acm.org/citation.cfm?id=1267903.1267904},
  acmid     = {1267904},
  publisher = {USENIX Association},
  address   = {Berkeley, CA, USA},
}
% Dwork, Lynch & Stockmeyer, J. ACM 35(2), 1988.
@article{Dwork:1988:CPP:42282.42283,
  author    = {Dwork, Cynthia and Lynch, Nancy and Stockmeyer, Larry},
  title     = {Consensus in the presence of partial synchrony},
  journal   = {J. ACM},
  volume    = {35},
  number    = {2},
  month     = apr,
  year      = {1988},
  issn      = {0004-5411},
  pages     = {288--323},
  numpages  = {36},
  url       = {http://doi.acm.org/10.1145/42282.42283},
  doi       = {10.1145/42282.42283},
  acmid     = {42283},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Boichat, Dutta, Frolund & Guerraoui, SIGACT News.
% NOTE(review): the export left `year` empty and put "2003" in `pages`; it is treated as the year here.
% Volume, number and the real page range are still missing -- fill in from the journal.
@article{Boichat_deconstructingpaxos,
  author  = {Boichat, Romain and Dutta, Partha and Fr{\o}lund, Svend and Guerraoui, Rachid},
  title   = {Deconstructing {Paxos}},
  journal = {SIGACT News},
  year    = {2003},
}
% Lamport, ACM Trans. Comput. Syst. 16(2), 1998.
@article{Lamport:1998:PP:279227.279229,
  author    = {Lamport, Leslie},
  title     = {The part-time parliament},
  journal   = {ACM Trans. Comput. Syst.},
  volume    = {16},
  number    = {2},
  month     = may,
  year      = {1998},
  issn      = {0734-2071},
  pages     = {133--169},
  numpages  = {37},
  url       = {http://doi.acm.org/10.1145/279227.279229},
  doi       = {10.1145/279227.279229},
  acmid     = {279229},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {state machines, three-phase commit, voting},
}
% Elnozahy, Alvisi, Wang & Johnson, ACM Comput. Surv. 34(3), 2002.
@article{Elnozahy:2002:SRP:568522.568525,
  author     = {Elnozahy, E. N. (Mootaz) and Alvisi, Lorenzo and Wang, Yi-Min and Johnson, David B.},
  title      = {A survey of rollback-recovery protocols in message-passing systems},
  journal    = {ACM Comput. Surv.},
  issue_date = {September 2002},
  volume     = {34},
  number     = {3},
  month      = sep,
  year       = {2002},
  issn       = {0360-0300},
  pages      = {375--408},
  numpages   = {34},
  url        = {http://doi.acm.org/10.1145/568522.568525},
  doi        = {10.1145/568522.568525},
  acmid      = {568525},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  keywords   = {message logging, rollback-recovery},
}
% Koo & Toueg, ACM Fall Joint Computer Conference, 1986.
@inproceedings{Koo:1986:CRD:324493.325074,
  author    = {Koo, Richard and Toueg, Sam},
  title     = {Checkpointing and rollback-recovery for distributed systems},
  booktitle = {Proceedings of 1986 ACM Fall joint computer conference},
  series    = {ACM '86},
  year      = {1986},
  isbn      = {0-8186-4743-4},
  location  = {Dallas, Texas, United States},
  pages     = {1150--1158},
  numpages  = {9},
  url       = {http://portal.acm.org/citation.cfm?id=324493.325074},
  acmid     = {325074},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, CA, USA},
}
% Bautista Gomez, Maruyama, Cappello & Matsuoka, CCGrid 2010.
% NOTE(review): the export was an @article with the conference name in `journal` and a placeholder volume 0;
% it is recast as a conference paper. The first author's surname is assumed to be the compound
% "Bautista Gomez" -- confirm.
@inproceedings{10.1109/CCGRID.2010.40,
  author    = {Bautista Gomez, Leonardo Arturo and Maruyama, Naoya and Cappello, Franck and Matsuoka, Satoshi},
  title     = {Distributed Diskless Checkpoint for Large Scale Systems},
  booktitle = {{IEEE} International Symposium on Cluster Computing and the Grid},
  isbn      = {978-0-7695-4039-9},
  year      = {2010},
  pages     = {63--72},
  doi       = {10.1109/CCGRID.2010.40},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% Charron-Bost & Schiper, SIGACT News 38(1), 2007.
@article{Charron-Bost:2007:HDF:1233481.1233496,
  author    = {Charron-Bost, Bernadette and Schiper, Andr{\'e}},
  title     = {Harmful dogmas in fault tolerant distributed computing},
  journal   = {SIGACT News},
  volume    = {38},
  number    = {1},
  month     = mar,
  year      = {2007},
  issn      = {0163-5700},
  pages     = {53--61},
  numpages  = {9},
  url       = {http://doi.acm.org/10.1145/1233481.1233496},
  doi       = {10.1145/1233481.1233496},
  acmid     = {1233496},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Camargos, Schmidt & Pedone, PODC 2007.
@inproceedings{Camargos:2007:MP:1281100.1281150,
  author    = {Camargos, L{\'a}saro Jonas and Schmidt, Rodrigo Malta and Pedone, Fernando},
  title     = {Multicoordinated {Paxos}},
  booktitle = {Proceedings of the twenty-sixth annual ACM symposium on Principles of distributed computing},
  series    = {PODC '07},
  year      = {2007},
  isbn      = {978-1-59593-616-5},
  location  = {Portland, Oregon, USA},
  pages     = {316--317},
  numpages  = {2},
  url       = {http://doi.acm.org/10.1145/1281100.1281150},
  doi       = {10.1145/1281100.1281150},
  acmid     = {1281150},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Paxos, atomic broadcast, consensus, generalized, multicoordinated},
}
% Fischer, FCT 1983.
@inproceedings{Fischer:1983:CPU:647891.739594,
  author    = {Fischer, Michael J.},
  title     = {The Consensus Problem in Unreliable Distributed Systems (A Brief Survey)},
  booktitle = {Proceedings of the 1983 International FCT-Conference on Fundamentals of Computation Theory},
  year      = {1983},
  isbn      = {3-540-12689-9},
  pages     = {127--140},
  numpages  = {14},
  url       = {http://portal.acm.org/citation.cfm?id=647891.739594},
  acmid     = {739594},
  publisher = {Springer-Verlag},
  address   = {London, UK},
}
% Shahmirzadi, Mena & Schiper, SRDS 2009.
% NOTE(review): the export was an @article with the conference name in `journal` and a placeholder volume 0;
% it is recast as a conference paper.
@inproceedings{10.1109/SRDS.2009.25,
  author    = {Shahmirzadi, Omid and Mena, Sergio and Schiper, Andr{\'e}},
  title     = {Relaxed Atomic Broadcast: State-Machine Replication Using Bounded Memory},
  booktitle = {{IEEE} Symposium on Reliable Distributed Systems},
  issn      = {1060-9857},
  year      = {2009},
  pages     = {3--11},
  doi       = {10.1109/SRDS.2009.25},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% Okun & Barak, SRDS 2002.
% NOTE(review): the page range is open-ended in the export (only the first page is known here).
@inproceedings{Okun:2002:NSR:829526.831119,
  author    = {Okun, Michael and Barak, Amnon},
  title     = {On Node State Reconstruction for Fault Tolerant Distributed Algorithms},
  booktitle = {Proceedings of the 21st IEEE Symposium on Reliable Distributed Systems},
  series    = {SRDS '02},
  year      = {2002},
  isbn      = {0-7695-1659-9},
  pages     = {160--},
  url       = {http://portal.acm.org/citation.cfm?id=829526.831119},
  acmid     = {831119},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  keywords  = {Distributed algorithms, fault tolerance, state reconstruction, recovery},
}
% Chandra, Griesemer & Redstone, PODC 2007.
@inproceedings{Chandra:2007:PML:1281100.1281103,
  author    = {Chandra, Tushar D. and Griesemer, Robert and Redstone, Joshua},
  title     = {{Paxos} made live: an engineering perspective},
  booktitle = {Proceedings of the twenty-sixth annual ACM symposium on Principles of distributed computing},
  series    = {PODC '07},
  year      = {2007},
  isbn      = {978-1-59593-616-5},
  location  = {Portland, Oregon, USA},
  pages     = {398--407},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/1281100.1281103},
  doi       = {10.1145/1281100.1281103},
  acmid     = {1281103},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Paxos, experiences, fault-tolerance, implementation},
}
% Juang & Venkatesan, 11th International Conference on Distributed Computing Systems.
% NOTE(review): the exported booktitle names the 1991 (11th) conference while `year` said 2002;
% the year is set to 1991 to match -- confirm. The key is unchanged so existing citations still resolve.
@inproceedings{juang2002crash,
  author       = {Juang, T. T. Y. and Venkatesan, S.},
  title        = {Crash recovery with little overhead},
  booktitle    = {11th International Conference on Distributed Computing Systems},
  pages        = {454--461},
  isbn         = {0818621443},
  year         = {1991},
  organization = {IEEE},
}
% Freiling, Lambertz & Majster-Cederbaum, PDCAT 2009.
@inproceedings{Freiling:2009:MCA:1729641.1730101,
  author    = {Freiling, Felix C. and Lambertz, Christian and Majster-Cederbaum, Mila},
  title     = {Modular Consensus Algorithms for the Crash-Recovery Model},
  booktitle = {Proceedings of the 2009 International Conference on Parallel and Distributed Computing, Applications and Technologies},
  series    = {PDCAT '09},
  year      = {2009},
  isbn      = {978-0-7695-3914-0},
  pages     = {287--292},
  numpages  = {6},
  url       = {http://dx.doi.org/10.1109/PDCAT.2009.50},
  doi       = {10.1109/PDCAT.2009.50},
  acmid     = {1730101},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  keywords  = {asynchronous systems, consensus, fault tolerance, process crash and recovery, stable storage},
}
% Oliveira, Guerraoui & Schiper, technical report 97-239, August 1997.
@techreport{oliveira97:consensus,
  author      = {Oliveira, R. and Guerraoui, R. and Schiper, A.},
  title       = {Consensus in the crash-recover model},
  institution = {D{\'e}partement d'Informatique, Ecole Polytechnique F{\'e}d{\'e}rale},
  number      = {97-239},
  address     = {Lausanne, Switzerland},
  month       = aug,
  year        = {1997},
}
% Freiling, Lambertz & Majster-Cederbaum, DISC 2008.
@inproceedings{Freiling:2008:ECA:1432291.1432332,
  author    = {Freiling, Felix C. and Lambertz, Christian and Majster-Cederbaum, Mila},
  title     = {Easy Consensus Algorithms for the Crash-Recovery Model},
  booktitle = {Proceedings of the 22nd international symposium on Distributed Computing},
  series    = {DISC '08},
  year      = {2008},
  isbn      = {978-3-540-87778-3},
  location  = {Arcachon, France},
  pages     = {507--508},
  numpages  = {2},
  url       = {http://dx.doi.org/10.1007/978-3-540-87779-0_39},
  doi       = {10.1007/978-3-540-87779-0_39},
  acmid     = {1432332},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
}
% Vieira & Buzato, Inf. Process. Lett. 107(5), 2008.
@article{Vieira:2008:CRF:1390853.1390875,
  author    = {Vieira, Gustavo M. D. and Buzato, Luiz E.},
  title     = {On the coordinator's rule for {Fast Paxos}},
  journal   = {Inf. Process. Lett.},
  volume    = {107},
  number    = {5},
  month     = aug,
  year      = {2008},
  issn      = {0020-0190},
  pages     = {183--187},
  numpages  = {5},
  url       = {http://portal.acm.org/citation.cfm?id=1390853.1390875},
  doi       = {10.1016/j.ipl.2008.03.001},
  acmid     = {1390875},
  publisher = {Elsevier North-Holland, Inc.},
  address   = {Amsterdam, The Netherlands},
  keywords  = {Consensus, Crash-recovery, Distributed systems, Paxos},
}
% Schneider, ACM Comput. Surv. 22(4), 1990.
@article{Schneider:1990:IFS:98163.98167,
  author    = {Schneider, Fred B.},
  title     = {Implementing fault-tolerant services using the state machine approach: a tutorial},
  journal   = {ACM Comput. Surv.},
  volume    = {22},
  number    = {4},
  month     = dec,
  year      = {1990},
  issn      = {0360-0300},
  pages     = {299--319},
  numpages  = {21},
  url       = {http://doi.acm.org/10.1145/98163.98167},
  doi       = {10.1145/98163.98167},
  acmid     = {98167},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Lamport, Computer Networks 2(2), 1978.
@article{lamport1978implementation,
  author    = {Lamport, Leslie},
  title     = {The implementation of reliable distributed multiprocess systems},
  journal   = {Computer Networks (1976)},
  volume    = {2},
  number    = {2},
  pages     = {95--114},
  issn      = {0376-5075},
  year      = {1978},
  publisher = {Elsevier},
}
% Oki & Liskov, PODC 1988.
@inproceedings{Oki:1988:VRN:62546.62549,
  author    = {Oki, Brian M. and Liskov, Barbara H.},
  title     = {Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems},
  booktitle = {Proceedings of the seventh annual ACM Symposium on Principles of distributed computing},
  series    = {PODC '88},
  year      = {1988},
  isbn      = {0-89791-277-2},
  location  = {Toronto, Ontario, Canada},
  pages     = {8--17},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/62546.62549},
  doi       = {10.1145/62546.62549},
  acmid     = {62549},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Lamport, Shostak & Pease, ACM Trans. Program. Lang. Syst. 4(3), 1982.
@article{Lamport:1982:BGP:357172.357176,
  author     = {Lamport, Leslie and Shostak, Robert and Pease, Marshall},
  title      = {The {Byzantine} Generals Problem},
  journal    = {ACM Trans. Program. Lang. Syst.},
  issue_date = {July 1982},
  volume     = {4},
  number     = {3},
  month      = jul,
  year       = {1982},
  issn       = {0164-0925},
  pages      = {382--401},
  numpages   = {20},
  url        = {http://doi.acm.org/10.1145/357172.357176},
  doi        = {10.1145/357172.357176},
  acmid      = {357176},
  publisher  = {ACM},
  address    = {New York, NY, USA},
}
% Neiger & Toueg, J. Algorithms 11(3), 1990.
@article{Neiger:1990:AIF:83334.83337,
  author    = {Neiger, Gil and Toueg, Sam},
  title     = {Automatically increasing the fault-tolerance of distributed algorithms},
  journal   = {J. Algorithms},
  volume    = {11},
  number    = {3},
  month     = sep,
  year      = {1990},
  issn      = {0196-6774},
  pages     = {374--419},
  numpages  = {46},
  url       = {http://portal.acm.org/citation.cfm?id=83334.83337},
  doi       = {10.1016/0196-6774(90)90019-B},
  acmid     = {83337},
  publisher = {Academic Press, Inc.},
  address   = {Duluth, MN, USA},
}
% Vieira & Buzato, technical report IC-10-26, August 2010.
@techreport{vieira10:implementing-tr,
  author      = {Vieira, Gustavo and Buzato, Luiz},
  title       = {Implementation of an Object-Oriented Specification for Active Replication Using Consensus},
  institution = {Instituto de Computa{\c{c}}{\~a}o, Universidade Estadual de Campinas},
  number      = {IC-10-26},
  month       = aug,
  year        = {2010},
}
% Raynal, SIGACT News 36(1), 2005.
% NOTE(review): the export spelled the author "Reynal"; other entries in this file spell the same
% surname "Raynal", so the author field is corrected. The key keeps the exported spelling so
% existing citations still resolve.
@article{Reynal:2005:SIF:1052796.1052806,
  author    = {Raynal, Michel},
  title     = {A short introduction to failure detectors for asynchronous distributed systems},
  journal   = {SIGACT News},
  volume    = {36},
  number    = {1},
  month     = mar,
  year      = {2005},
  issn      = {0163-5700},
  pages     = {53--70},
  numpages  = {18},
  url       = {http://doi.acm.org/10.1145/1052796.1052806},
  doi       = {10.1145/1052796.1052806},
  acmid     = {1052806},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Bonnet & Raynal, AINA 2010.
@inproceedings{Bonnet:2010:CAD:1825731.1826088,
  author    = {Bonnet, Fran{\c{c}}ois and Raynal, Michel},
  title     = {Consensus in Anonymous Distributed Systems: Is There a Weakest Failure Detector?},
  booktitle = {Proceedings of the 2010 24th IEEE International Conference on Advanced Information Networking and Applications},
  series    = {AINA '10},
  year      = {2010},
  isbn      = {978-0-7695-4018-4},
  pages     = {206--213},
  numpages  = {8},
  url       = {http://dx.doi.org/10.1109/AINA.2010.19},
  doi       = {10.1109/AINA.2010.19},
  acmid     = {1826088},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
}
% Gupta, Chandra & Goldszmidt, PODC 2001.
@inproceedings{Gupta:2001:SED:383962.384010,
  author    = {Gupta, Indranil and Chandra, Tushar D. and Goldszmidt, Germ{\'a}n S.},
  title     = {On scalable and efficient distributed failure detectors},
  booktitle = {Proceedings of the twentieth annual ACM symposium on Principles of distributed computing},
  series    = {PODC '01},
  year      = {2001},
  isbn      = {1-58113-383-9},
  location  = {Newport, Rhode Island, United States},
  pages     = {170--179},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/383962.384010},
  doi       = {10.1145/383962.384010},
  acmid     = {384010},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {accuracy, distributed systems, efficiency, failure detectors, scalability},
}
% Xiong, Yang, Cao, He & Shu, CSE 2009.
@inproceedings{Xiong:2009:SFD:1632708.1633468,
  author    = {Xiong, Naixue and Yang, Yan and Cao, Ming and He, Jing and Shu, Lei},
  title     = {A Survey on Fault-Tolerance in Distributed Network Systems},
  booktitle = {Proceedings of the 2009 International Conference on Computational Science and Engineering - Volume 02},
  series    = {CSE '09},
  year      = {2009},
  isbn      = {978-0-7695-3823-5},
  pages     = {1065--1070},
  numpages  = {6},
  url       = {http://dx.doi.org/10.1109/CSE.2009.497},
  doi       = {10.1109/CSE.2009.497},
  acmid     = {1633468},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  keywords  = {Failure detector, Fault-tolerance, Network Systems, Quality-of-service},
}
% Xiong, Yang, Cao, He & Shu, CSE 2009 (IEEE Computer Society export).
% NOTE(review): same paper (same DOI, title and authors) as the entry keyed
% Xiong:2009:SFD:1632708.1633468; kept because this key may be cited. The export was an
% @article with the conference name in `journal`; it is recast as a conference paper.
@inproceedings{10.1109/CSE.2009.497,
  author    = {Xiong, Naixue and Yang, Yan and Cao, Ming and He, Jing and Shu, Lei},
  title     = {A Survey on Fault-Tolerance in Distributed Network Systems},
  booktitle = {{IEEE} International Conference on Computational Science and Engineering},
  volume    = {2},
  isbn      = {978-0-7695-3823-5},
  year      = {2009},
  pages     = {1065--1070},
  doi       = {10.1109/CSE.2009.497},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% Chockler, Keidar & Vitenberg, ACM Comput. Surv. 33(4), 2001.
@article{Chockler:2001:GCS:503112.503113,
  author     = {Chockler, Gregory V. and Keidar, Idit and Vitenberg, Roman},
  title      = {Group communication specifications: a comprehensive study},
  journal    = {ACM Comput. Surv.},
  issue_date = {December 2001},
  volume     = {33},
  number     = {4},
  month      = dec,
  year       = {2001},
  issn       = {0360-0300},
  pages      = {427--469},
  numpages   = {43},
  url        = {http://doi.acm.org/10.1145/503112.503113},
  doi        = {10.1145/503112.503113},
  acmid      = {503113},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  keywords   = {Group communication systems, partitionable group membership, process group membership, specifications of group communication systems, view synchrony, virtual synchrony},
}
% Hurfin, Mostefaoui & Raynal, SRDS 1998.
% The export separated the first two authors with a comma, which BibTeX parses as a single
% name; the list now uses " and " between all three authors.
% NOTE(review): the page range is open-ended in the export (only the first page is known here).
@inproceedings{Hurfin:1998:CAS:829523.830974,
  author    = {Hurfin, M. and Most{\'e}faoui, A. and Raynal, M.},
  title     = {Consensus in Asynchronous Systems Where Processes Can Crash and Recover},
  booktitle = {Proceedings of the 17th IEEE Symposium on Reliable Distributed Systems},
  series    = {SRDS '98},
  year      = {1998},
  isbn      = {0-8186-9218-9},
  pages     = {280--},
  url       = {http://portal.acm.org/citation.cfm?id=829523.830974},
  acmid     = {830974},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
}
% Pinheiro, Weber & Barroso, USENIX FAST 2007.
@inproceedings{Pinheiro:2007:FTL:1267903.1267905,
  author    = {Pinheiro, Eduardo and Weber, Wolf-Dietrich and Barroso, Luiz Andr{\'e}},
  title     = {Failure trends in a large disk drive population},
  booktitle = {Proceedings of the 5th USENIX conference on File and Storage Technologies},
  year      = {2007},
  location  = {San Jose, CA},
  pages     = {2--2},
  numpages  = {1},
  url       = {http://portal.acm.org/citation.cfm?id=1267903.1267905},
  acmid     = {1267905},
  publisher = {USENIX Association},
  address   = {Berkeley, CA, USA},
}
% Gray & van Ingen, arXiv e-print cs/0701166, January 2007 (ADS export).
% The arXiv identifier is stored bare in `eprint` with the archive named in `archiveprefix`.
@article{gray07:empirical,
  author        = {Gray, J. and van Ingen, C.},
  title         = {Empirical Measurements of Disk Failure Rates and Error Rates},
  journal       = {ArXiv Computer Science e-prints},
  eprint        = {cs/0701166},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Databases, Computer Science - Architecture},
  year          = {2007},
  month         = jan,
  adsurl        = {http://adsabs.harvard.edu/abs/2007cs........1166G},
}
% Warns, Storm & Hasselbring, SRDS 2008.
% NOTE(review): the export was an @article with the conference name in `journal` and a placeholder volume 0;
% it is recast as a conference paper.
@inproceedings{10.1109/SRDS.2008.9,
  author    = {Warns, Timo and Storm, Christian and Hasselbring, Wilhelm},
  title     = {Availability of Globally Distributed Nodes: An Empirical Evaluation},
  booktitle = {{IEEE} Symposium on Reliable Distributed Systems},
  issn      = {1060-9857},
  year      = {2008},
  pages     = {279--284},
  doi       = {10.1109/SRDS.2008.9},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% Menderico & Garcia, SRDS 2010.
% NOTE(review): the export was an @article with the conference name in `journal` and a placeholder volume 0;
% it is recast as a conference paper.
@inproceedings{10.1109/SRDS.2010.17,
  author    = {Menderico, Raphael Marcos and Garcia, Islene Calciolari},
  title     = {Diskless Checkpointing with Rollback-Dependency Trackability},
  booktitle = {{IEEE} Symposium on Reliable Distributed Systems},
  issn      = {1060-9857},
  year      = {2010},
  pages     = {275--281},
  doi       = {10.1109/SRDS.2010.17},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
}
% Chandy & Lamport, ACM Trans. Comput. Syst. 3(1), 1985.
@article{Chandy:1985:DSD:214451.214456,
  author    = {Chandy, K. Mani and Lamport, Leslie},
  title     = {Distributed snapshots: determining global states of distributed systems},
  journal   = {ACM Trans. Comput. Syst.},
  volume    = {3},
  number    = {1},
  month     = feb,
  year      = {1985},
  issn      = {0734-2071},
  pages     = {63--75},
  numpages  = {13},
  url       = {http://doi.acm.org/10.1145/214451.214456},
  doi       = {10.1145/214451.214456},
  acmid     = {214456},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
% Randell, Lee & Treleaven, ACM Comput. Surv. 10(2), 1978.
@article{Randell:1978:RIC:356725.356729,
  author    = {Randell, B. and Lee, P. and Treleaven, P. C.},
  title     = {Reliability Issues in Computing System Design},
  journal   = {ACM Comput. Surv.},
  volume    = {10},
  number    = {2},
  month     = jun,
  year      = {1978},
  issn      = {0360-0300},
  pages     = {123--165},
  numpages  = {43},
  url       = {http://doi.acm.org/10.1145/356725.356729},
  doi       = {10.1145/356725.356729},
  acmid     = {356729},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
746 @article{Pease:1980:RAP:322186.322188,
747 author = {Pease, M. and Shostak, R. and Lamport, L.},
748 title = {Reaching Agreement in the Presence of Faults},
749 journal = {J. ACM},
750 volume = {27},
751 issue = {2},
752 month = {April},
753 year = {1980},
754 issn = {0004-5411},
755 pages = {228--234},
756 numpages = {7},
757 url = {http://doi.acm.org/10.1145/322186.322188},
758 doi = {http://doi.acm.org/10.1145/322186.322188},
759 acmid = {322188},
760 publisher = {ACM},
761 address = {New York, NY, USA},