%% $Header: /users/fgaertner/cvsroot/felix/tex/bibliographies/felix-stabilization.bib,v 1.3 2002/11/26 19:19:20 fgaertner Exp $ %%% Edited by Felix Gaertner <felix at informatik.tu-darmstadt.de> %%% %%% ==================================================================== %%% BibTeX-file{ %%% author = "Felix C. Gaertner", %%% version = "see RCS Header", %%% date = "see RCS Header", %%% time = "see RCS Header", %%% filename = "felix-stabilization.bib", %%% address = "EPFL, I&C, LPD, Switzerland", %%% telephone = "+41-21 693 7501", %%% FAX = "+41 21 693 7570", %%% URL = "http://lpdwww.epfl.ch/fgaertner/", %%% checksum = "XXX", %%% email = "fgaertner at lpdmail.epfl.ch, %%% fcg at acm.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "bibliography, stabilization, fault-tolerance", %%% supported = "no", %%% docstring = "This BibTeX file records books and articles %%% about fault-tolerance, including topics %%% like stabilization, self-stabilization and %%% whatever seems important to me. The annote %%% field contains short content descriptions %%% for my own personal use which might be %%% interesting for others too. The ISBN %%% fields will be printed if the is-alpha.bst %%% or is-plain.bst style files are used. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by the biblabel software %%% developed for the BibNet Project. %%% %%% In this bibliography, entries are sorted %%% first by ascending year, and within each %%% year, alphabetically by author or editor, %%% and then, if necessary, by the 3-letter %%% abbreviation at the end of the BibTeX %%% citation tag, using the bibsort -byyear %%% utility. Year order has been chosen to %%% make it easier to identify the most recent %%% work. 
%%%
%%%  The bibsort utility, and several related
%%%  programs for bibliography maintenance, is
%%%  available on ftp.math.utah.edu in
%%%  /pub/tex/bib, and at other Internet sites
%%%  which mirror it, including the
%%%  Comprehensive TeX Archive Network (CTAN);
%%%  the command `finger ctan<at>pip.shsu.edu'
%%%  will produce a list of CTAN hosts.
%%%
%%%  The checksum field above contains a CRC-16
%%%  checksum as the first value, followed by the
%%%  equivalent of the standard UNIX wc (word
%%%  count) utility output of lines, words, and
%%%  characters. This is produced by Robert
%%%  Solovay's checksum utility."
%%%  }
%%% ====================================================================
%%%
%%% Thanks go to:
%%%   Nelson Beebe
%%%
%=======================================================================
% Acknowledgement abbreviations:

%=======================================================================
% Institutional abbreviations:

@string{inst-STAN-CS            = "Stanford University, Department of Computer Science"}

%=======================================================================
% Journal abbreviations:
% One macro per journal; entries reference these names unquoted.

@string{j-ACM                   = "Journal of the ACM"}
@string{j-ACM-ADALET            = "ACM Ada Letters"}
@string{j-ACM-COMPREV           = "ACM Computing Reviews"}
@string{j-ACM-COMP-SURVEYS      = "ACM Computing Surveys"}
@string{j-APL-QUOTE-QUAD        = "APL Quote Quad"}
@string{j-CACM                  = "Communications of the ACM"}
@string{j-CCCUJ                 = "C/C++ Users Journal"}
@string{j-COMP-J                = "The Computer Journal"}
@string{j-COMP-LANG-MAG         = "Computer Language Magazine"}
@string{j-COMPUT-STAT-Q         = "Computational Statistics Quarterly"}
@string{j-COMPUTER              = "Computer"}
@string{j-DC                    = "Distributed Computing"}
@string{j-DDJ                   = "Dr. Dobb's Journal of Software Tools"}
@string{j-IEEE-ASSP-MAG         = "IEEE ASSP magazine: a publication of the IEEE Acoustics, Speech, and Signal Processing Society"}
@string{j-IEEE-SOFTWARE         = "IEEE Software"}
@string{j-IEEE-COMPUTER         = "IEEE Computer"}
@string{j-IEEE-TRANS-SOFTW-ENG  = "IEEE Transactions on Software Engineering"}
@string{j-IEEE-TRANS-COMP       = "IEEE Transactions on Computers"}
@string{j-IFIP-TRANS-A          = "IFIP Transactions. A. Computer Science and Technology"}
@string{j-INFO-PROC-SOC-JAPAN   = "Journal of the Information Processing Society of Japan = Joho Shori"}
@string{j-IPL                   = "Information Processing Letters"}
@string{j-INFORMATIE            = "Informatie"}
@string{j-IS                    = "Informatik Spektrum"}
@string{j-J-COMP-SCI-TECH       = "Journal of Computer Science and Technology"}
@string{j-J-OOP                 = "Journal of Object Oriented Programming"}
@string{j-LINUX-JOURNAL         = "Linux Journal"}
@string{j-RS-MAGAZINE           = "RS\slash Magazine"}
@string{j-SEJ                   = "Software Engineering Journal"}
@string{j-SIGCSE                = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)"}
@string{j-SIGPLAN               = "ACM SIGPLAN Notices"}
@string{j-SOFTWARE-CONCEPTS-TOOLS = "Software --- Concepts and Tools"}
@string{j-SPE                   = "Soft{\-}ware\emdash Prac{\-}tice and Experience"}
@string{j-STRUCT-PROGRAM        = "Structured Programming"}
@string{j-SUNEXPERT             = "SunExpert"}
@string{j-TEXHAX                = "{\TeX{}{\-}hax}"}
@string{j-TEXNIQUES             = "{\TeX{}}{\-}niques, Publications for the {\TeX{}} community"}
@string{j-TOPLAS                = "ACM Transactions on Programming Languages and Systems"}
@string{j-TOCS                  = "ACM Transactions on Computer Systems"}
@string{j-TUGBOAT               = "{\TUB{}}"}

%=======================================================================
% Proceedings abbreviations:

@string{pro-ftcs85 =
  "Proceedings of the 15th IEEE Symposium on Fault Tolerant Computing Systems (FTCS-15)"}
@string{pro-ftcs93 =
  "Proceedings of the 23rd IEEE Symposium on Fault Tolerant Computing Systems (FTCS-23)"}
@string{pro-ftcs96 =
  "Proceedings of the 26th IEEE Symposium 
on Fault Tolerant Computing Systems (FTCS-26)"}
@string{pro-ftcs97 =
  "Proceedings of the 27th IEEE Symposium on Fault Tolerant Computing Systems (FTCS-27)"}
@string{pro-ftcs98 =
  "Proceedings of the 28th IEEE Symposium on Fault Tolerant Computing Systems (FTCS-28)"}
@string{pro-ftcs98-fastabs =
  "Digest of FastAbstracts of the 28th IEEE Symposium on Fault Tolerant Computing Systems (FTCS-28)"}

% International Workshop on Distributed Algorithms (WDAG)
@string{pro-wdag89 =
  "Proceedings of the 3rd International Workshop on Distributed Algorithms (WDAG89)"}
@string{pro-wdag90 =
  "Proceedings of the 4th International Workshop on Distributed Algorithms (WDAG90)"}
@string{pro-wdag91 =
  "Proceedings of the 5th International Workshop on Distributed Algorithms (WDAG91)"}
@string{pro-wdag92 =
  "Proceedings of the 6th International Workshop on Distributed Algorithms (WDAG92)"}
@string{pro-wdag93 =
  "Proceedings of the 7th International Workshop on Distributed Algorithms (WDAG93)"}
@string{pro-wdag94 =
  "Proceedings of the 8th International Workshop on Distributed Algorithms (WDAG94)"}
@string{pro-wdag95 =
  "Proceedings of the 9th International Workshop on Distributed Algorithms (WDAG95)"}
@string{pro-wdag96 =
  "Proceedings of the 10th International Workshop on Distributed Algorithms (WDAG96)"}
@string{pro-wdag97 =
  "Proceedings of the 11th International Workshop on Distributed Algorithms (WDAG97)"}

% ACM Symposium on Principles of Distributed Computing (PODC)
@string{pro-podc84 =
  "Proceedings of the 3rd Annual ACM Symposium on Principles of Distributed Computing (PODC'84)"}
@string{pro-podc90 =
  "Proceedings of the 9th Annual ACM Symposium on Principles of Distributed Computing (PODC'90)"}
@string{pro-podc91 =
  "Proceedings of the 10th Annual ACM Symposium on Principles of Distributed Computing (PODC'91)"}
@string{pro-podc92 =
  "Proceedings of the 11th Annual ACM Symposium on Principles of Distributed Computing (PODC'92)"}
@string{pro-podc93 =
  "Proceedings of the 12th Annual ACM Symposium on Principles of Distributed Computing (PODC'93)"}
@string{pro-podc94 =
  "Proceedings of the 13th Annual ACM Symposium on 
Principles of Distributed Computing (PODC'94)"}
@string{pro-podc95 =
  "Proceedings of the 14th Annual ACM Symposium on Principles of Distributed Computing (PODC'95)"}
@string{pro-podc96 =
  "Proceedings of the 15th Annual ACM Symposium on Principles of Distributed Computing (PODC'96)"}
@string{pro-podc97 =
  "Proceedings of the 16th Annual ACM Symposium on Principles of Distributed Computing (PODC'97)"}
@string{pro-podc98 =
  "Proceedings of the 17th Annual ACM Symposium on Principles of Distributed Computing (PODC'98)"}
@string{pro-podc99 =
  "Proceedings of the 18th Annual ACM Symposium on Principles of Distributed Computing (PODC'99)"}

% IEEE Symposium on Reliable Distributed Systems (SRDS)
@string{pro-srds91 =
  "Proceedings of the 10th IEEE Symposium on Reliable Distributed Systems (SRDS91)"}
@string{pro-srds92 =
  "Proceedings of the 11th IEEE Symposium on Reliable Distributed Systems (SRDS92)"}
@string{pro-srds94 =
  "Proceedings of the 13th IEEE Symposium on Reliable Distributed Systems (SRDS94)"}
@string{pro-srds95 =
  "Proceedings of the 14th IEEE Symposium on Reliable Distributed Systems (SRDS95)"}
@string{pro-srds2000 =
  "Proceedings of the 19th IEEE Symposium on Reliable Distributed Systems (SRDS2000)"}

% Workshop on Self-Stabilizing Systems (WSS)
@string{pro-wss95 =
  "Proceedings of the 2nd Workshop on Self-Stabilizing Systems"}
@string{pro-wss97 =
  "Proceedings of the 3rd Workshop on Self-Stabilizing Systems"}
@string{pro-wss99 =
  "Proceedings of the 19th IEEE International Conference on Distributed Computing Systems Workshop on Self-Stabilizing Systems"}

% IEEE International Conference on Distributed Computing Systems (ICDCS)
@string{pro-icdcs94 =
  "Proceedings of the 14th IEEE International Conference on Distributed Computing Systems (ICDCS94)"}
@string{pro-icdcs96 =
  "Proceedings of the 16th IEEE International Conference on Distributed Computing Systems (ICDCS96)"}
@string{pro-icdcs98 =
  "Proceedings of the 18th IEEE International Conference on Distributed Computing Systems (ICDCS98)"}
@string{pro-icdcs99 =
  "Proceedings of the 19th IEEE International Conference on Distributed Computing Systems (ICDCS99)"}

@string{asa = " International 
Symposium on Agent Systems and Applications"}
@string{ma  = " International Symposium on Mobile Agents"}
% asa and ma start with a blank so that they can be appended directly
% to an ordinal word in the concatenation below.
@string{asama2000 = "Proceedings of the " # "Second" # asa
                    # " and Fourth" # ma # " (ASA/MA2000)"}

%=======================================================================
% Publishers and their addresses:
% Each publisher macro pub-X is paired with an address macro pub-X:adr.

@string{pub-ACM             = "ACM Press, New York"}
@string{pub-ACM:adr         = "New York, NY 10036, USA"}

@string{pub-AW              = "Ad{\-d}i{\-s}on-Wes{\-l}ey, Reading, MA"}
@string{pub-AW:adr          = "Reading, MA, USA"}

@string{pub-BENCUM          = "Benjamin/Cummings Pub. Co."}
@string{pub-BENCUM:adr      = "Redwood City, CA, USA"}

@string{pub-IEEE            = "IEEE Computer Society Press"}
@string{pub-IEEE-CSP        = "IEEE Computer Society Press"}
@string{pub-IEEE-CSP:adr    = "Los Alamitos, CA, USA"}
@string{pub-IEEE:adr        = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"}

@string{pub-ITCP            = "International Thomson Computer Press"}
@string{pub-ITCP:adr        = "20 Park Plaza Suite 1001, Boston, MA 02116 USA"}

@string{pub-ITP             = "International Thomson Publishing"}
@string{pub-ITP:adr         = "5101 Madison Road, Cincinnati, OH 45227, USA"}

@string{pub-MH              = "McGraw-Hill"}
@string{pub-MH:adr          = "New York, NY, USA"}

@string{pub-MIT             = "MIT Press"}
@string{pub-MIT:adr         = "Cambridge, MA, USA"}

@string{pub-PH              = "Pren{\-}tice-Hall"}
@string{pub-PH:adr          = "Englewood Cliffs, NJ, USA"}

@string{pub-SUCSLI          = "Stanford University Center for the Study of Language and Information"}
@string{pub-SUCSLI:adr      = "Stanford, CA, USA"}

@string{pub-SV              = "Spring{\-}er-Ver{\-}lag"}
@string{pub-SV:adr          = "Berlin, Germany~/ Heidelberg, Germany~/ London, UK~/ etc."}

@string{pub-TEXPLORATOR     = "The {\TeX}plorators Corporation"}
@string{pub-TEXPLORATOR:adr = "3701 W. Alabama, Suite 450-273, Houston, TX 77027, USA"}

@string{pub-USENIX          = "USENIX"}
@string{pub-USENIX:adr      = "Berkeley, CA, USA"}

@string{pub-VNR             = "Van Nostrand Reinhold"}
@string{pub-VNR:adr         = "New York, NY, USA"}

@string{pub-WORLD-SCI       = "World Scientific Publishing Co. Pte. 
Ltd."}
@string{pub-WORLD-SCI:adr = "P. O. Box 128, Farrer Road, Singapore 9128"}

%=======================================================================
% Series abbreviations:

@string{ser-LNCS = "Lecture Notes in Computer Science"}

%=======================================================================
% Bibliography entries.

@inproceedings{Floyd:1967:AMP,
  author    = "R. W. Floyd",
  title     = "Assigning meaning to programs",
  editor    = "J. T. Schwartz",
  booktitle = "Mathematical aspects of computer science: Proc. American
               Mathematics Soc. symposia",
  year      = "1967",
  volume    = "19",
  pages     = "19--31",
  address   = "Providence, RI",
  publisher = "American Mathematical Society",
  annote    = "[to get] first idea of termination function to prove
               termination of algorithms."
}

@article{Dijkstra:1974:SSS,
  author  = "Edsger W. Dijkstra",
  title   = "Self-stabilizing systems in spite of distributed control",
  journal = j-CACM,
  volume  = 17,
  number  = 11,
  year    = 1974,
  pages   = "643--644",
  annote  = "Standard reference to the introduction of the notion of
             self-stabilization into computer science."
}

@article{Manna:1974:AAT,
  author  = "Zohar Manna and Amir Pnueli",
  title   = "Axiomatic approach to total correctness of programs",
  journal = "Acta Informatica",
  volume  = "3",
  pages   = "243--263",
  year    = "1974",
  annote  = "[to get] Call termination function ``convergence
             function''."
}

@book{Niemann:1974:MDM,
  author    = {H. Niemann},
  title     = {{Methoden der Mustererkennung}},
  publisher = {Akademische Verlagsgesellschaft},
  year      = {1974},
  address   = {Frankfurt},
  annote    = {[Angabe von Armin]}
}

@article{Dijkstra:1975:GCN,
  author  = {Edsger W. Dijkstra},
  title   = "Guarded commands, nondeterminacy, and formal derivation of
             programs",
  journal = j-CACM,
  year    = 1975,
  volume  = 18,
  number  = 8,
  month   = aug,
  pages   = "453--457"
}

@article{Katz:1975:CLT,
  author  = "Shmuel M. 
Katz and Zohar Manna",
  month   = dec,
  year    = "1975",
  title   = "A closer look at termination",
  journal = "Acta Informatica",
  volume  = "5",
  number  = "4",
  pages   = "333--352",
  annote  = "[to get] A comparison of four termination proving
             methods."
}

% The accent in the surname is written as a brace group that starts
% with the accent command, so BibTeX treats it as a single letter when
% sorting and building labels.
@article{Avizienis:1976:FTS,
  author  = "Algirdas Avi{\v{z}}ienis",
  title   = "Fault-tolerant systems",
  journal = j-IEEE-TRANS-COMP,
  year    = "1976",
  volume  = "25",
  number  = "12",
  pages   = "1304--1312",
  month   = dec,
  annote  = "This is a good and surprisingly advanced survey of fault
             tolerance issues (mainly in hardware) as of 1976. The main
             points include comparing the traditional `fault
             intolerant' approach which aims on taking only the most
             reliable components and putting them together without
             employing redundancy and relying on manual maintenance in
             case of failures, with the fault tolerant approach, which
             uses protective redundancy. While the former can be less
             costly in many situations, the latter is source for higher
             dependability figures and has psychological advantages if
             human lives could be endangered by the system. However,
             the two approaches are complementary! Furthermore,
             Avizienis describes three aspects of fault tolerance that
             have to be dealt with: (1) identification and
             characterization of the fault set to be tolerated, (2)
             development and choice of redundancy techniques, (3)
             analytic or experimental prediction of the effectiveness
             of the techniques. He also classifies faults by duration,
             extent and value, and identifies three forms of
             redundancy: hardware, software and time. He gives a first
             notion of the two necessary steps of detection and
             correction (see \cite{Arora:1998:CDM}) and a lot of
             examples of fault tolerant systems up to the year 1976.
             Overall, a rich and despite its age still insight-heavy
             paper."
}

@article{Denning:1976:LMS,
  author  = "Dorothy E. 
Denning",
  title     = "A Lattice Model of Secure Information Flow",
  journal   = j-CACM,
  year      = "1976",
  month     = may,
  volume    = "19",
  number    = "5",
  pages     = "236--243",
  OPTnote   = "Papers from the Fifth ACM Symposium on Operating Systems
               Principles (Univ. Texas, Austin, Tex., 1975).",
  abstract  = "Mechanisms that guarantee secure information flow in a
               computer system are discussed. These mechanisms are
               examined within a mathematical framework suitable for
               formulating the requirements of secure information flow
               among security classes. The central component of the
               model is a lattice structure derived from the security
               classes and justified by the semantics of information
               flow. The model provides a unifying view of all systems
               that restrict information flow, enables a classification
               of them according to security objectives, and suggests
               some new approaches. It also leads to the construction
               of automatic program certification mechanisms for
               verifying the secure flow of information through a
               program.",
  keywords  = "computer operating systems; data processing; lattice;
               mathematical models; program certification; secure
               information flow; security; security classes; security
               of data",
  treatment = "A Application; T Theoretical or Mathematical",
  annote    = "[to read]"
}

@book{Dijkstra:1976:DP,
  author     = {Edsger W. Dijkstra},
  title      = {A Discipline of Programming},
  publisher  = {Prentice-Hall},
  year       = {1976},
  annote     = {Great book.},
  OPTkey     = {},
  OPTvolume  = {},
  OPTnumber  = {},
  OPTseries  = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth   = {},
  OPTnote    = {}
}

@article{Lamport:1977:PCM,
  author  = {Leslie Lamport},
  title   = {Proving the correctness of multiprocess programs},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year    = 1977,
  month   = mar,
  volume  = "3",
  number  = 2,
  pages   = "125--143",
  OPTkey  = {},
  annote  = "First definition of terms ``safety'' and ``liveness''.
             What else?"
}

@inproceedings{Pnueli:1977:TLP,
  author       = "Amir Pnueli",
  title        = "The temporal logic of programs",
  booktitle    = "Proceedings of the 18th IEEE Symposium on the
                  Foundations of Computer Science (FOCS-77)",
  address      = "Providence, Rhode Island",
  publisher    = "IEEE Computer Society Press",
  organization = "IEEE",
  month        = oct # " 31--" # nov # " 2",
  year         = "1977",
  pages        = "46--57",
  annote       = "[to read] Presents the idea of reactive systems and
                  temporal logic in contrast to transformational
                  systems using Hoare Logic."
}

@inproceedings{Bartlett:1978:ANO,
  author    = "J. F. Bartlett",
  title     = "A {``NonStop''} operating system",
  booktitle = "Proceedings of the 11th Hawaii International Conference
               on System Sciences",
  volume    = "3",
  year      = "1978",
  annote    = "description of TANDEM system.",
}

@article{Lamport:1978:TCO,
  author  = {Leslie Lamport},
  title   = {Time, clocks and the ordering of events in a distributed
             system},
  journal = j-CACM,
  year    = 1978,
  volume  = {21},
  number  = {7},
  month   = jul,
  pages   = {558--565},
  annote  = "A famous and well-readable paper on causality and possible
             causal dependencies in distributed systems. Lamport is
             first to introduce the ``happened before'' relation (which
             corresponds to causality) and proposes the use of logical
             time instead of real time in distributed systems. He
             characterises the relation as being a partial order and
             shows how his logical time can be used to do mutual
             exclusion. Work has subsequently led to vector time
             (Fidge/Mattern, cite?)."
}

@article{Wensley:1978:SDA,
  author  = "J. H. Wensley and L. Lamport and J. Goldberg and
             M. W. Green and K. N. Levitt and P. M. Melliar-Smith and
             R. E. Shostak and C. B. 
Weinstock",
  title   = "{SIFT}: Design and analysis of a fault-tolerant computer
             for aircraft control",
  journal = "Proceedings of the IEEE",
  volume  = "66",
  number  = "10",
  month   = oct,
  year    = "1978",
  pages   = "1240--1255",
  annote  = "[to read]"
}

@inproceedings{Lamport:1980:SSN,
  author    = "Leslie Lamport",
  title     = "`{Sometimes}' is sometimes `not never'",
  booktitle = "Proceedings of SIGPLAN-80, 7th ACM Symposium on
               Principles of Programming Languages",
  address   = "Las Vegas, Nevada",
  year      = "1980",
  pages     = "174--185",
  annote    = "Discusses a difference between branching time and linear
               time notions of temporal logic. In linear time `not
               eventually $\neg\phi$' is equivalent to `always $\phi$'.
               This is not true in branching time. Lamport discusses
               the assumptions made by computer scientists about
               temporal properties: ``The logic of linear time was used
               by Pnueli [...], while the logic of branching time seems
               to be the one used by most computer scientists for
               reasoning about temporal concepts.'' As every paper by
               Lamport, extremely well readable stuff!"
}

% The journal is given through the j-ACM macro so the name is spelled
% in one place only.
@article{Pease:1980:RAP,
  author  = "M. Pease and R. Shostak and L. Lamport",
  title   = "Reaching Agreement in the Presence of Faults",
  journal = j-ACM,
  volume  = "27",
  number  = "2",
  pages   = "228--234",
  month   = apr,
  year    = "1980",
  annote  = "This paper is similar to their 1982 publication
             \cite{Lamport:1982:BGP}, but contains a rigorous proof of
             the impossibility of Byzantine agreement for the case
             $n=3$, $t=1$. As usual, $n$ is the total number of
             processes and $t$ is the number of faulty processes.",
}

@book{Burris:1981:CUA,
  author    = {Stanley N. Burris and H. P. 
Sankappanavar},
  title     = {A course in universal algebra},
  publisher = pub-SV,
  year      = {1981},
  note      = {Revised edition online at
               \url{http://thoralf.uwaterloo.ca/htdocs/ualg.html}}
}

@book{Gries:1981:SP,
  author    = {David Gries},
  title     = {The Science of Programming},
  publisher = pub-SV,
  year      = {1981}
}

@article{Pnueli:1981:TSC,
  author  = {Amir Pnueli},
  title   = {The temporal semantics of concurrent programs},
  journal = {Theoretical Computer Science},
  year    = {1981},
  volume  = {13},
  pages   = {45--60},
  annote  = {The semantics of a concurrent program specifies the set of
             execution sequences which are admissible as proper
             execution sequences of the program. Two main things must
             hold: (1) every state is obtained from its predecessor by
             executing a single enabled atomic action in one process,
             (2) no process which is infinitely often enabled will be
             infinitely often delayed (strong fairness). With this type
             of semantics one can introduce temporal operators
             ``always'' and ``eventually'' which can be used to
             precisely reformulate the usual program properties like
             termination, partial and total correctness,
             deadlock/starvation freedom etc. Also, proving that a
             program possesses some property reduces to proving a set
             inclusion. The logic still contains a ``next state''
             operator which is argued against by Lamport in
             \cite{Lamport:1983:WGT} because it doesn't support
             hierarchical proofs. Lamport regards this paper as the
             first to consider identifying programs with execution
             sequences and thus place programs and specifications onto
             the same formal level \cite{Abadi:1993:CS}.}
}

@techreport{Rabin:1981:HES,
  author      = "M. 
Rabin",
  title       = "How to exchange secrets by oblivious transfer",
  institution = "Harvard Aiken Computation Laboratory",
  number      = "TR-81",
  year        = "1981",
  annote      = "A probabilistic exchange protocol similar to
                 \cite{Blum:1983:HES}. [to get]"
}

@article{Chang:1982:EAD,
  author    = {E. J.-H. Chang},
  title     = {Echo algorithms: {Depth} parallel operations on general
               graphs},
  journal   = j-IEEE-TRANS-SOFTW-ENG,
  year      = {1982},
  volume    = {SE-8},
  pages     = {391--401},
  annote    = {[to get] Reference to Echo algorithm},
  OPTkey    = {},
  OPTnumber = {},
  OPTmonth  = {},
  OPTnote   = {}
}

@incollection{Girault:1982:PPC,
  author      = {C. Girault},
  title       = {Proof of protocols in the case of failures},
  editor      = {J. Evans},
  booktitle   = {Parallel processing systems. An advanced course},
  publisher   = {Cambridge University Press},
  year        = {1982},
  pages       = {121--139},
  annote      = {[to read]},
  OPTcrossref = {},
  OPTkey      = {},
  OPTvolume   = {},
  OPTnumber   = {},
  OPTseries   = {},
  OPTtype     = {},
  OPTchapter  = {},
  OPTaddress  = {},
  OPTedition  = {},
  OPTmonth    = {},
  OPTnote     = {}
}

@article{Lamport:1982:BGP,
  author      = "L. Lamport and R. Shostak and M. Pease",
  title       = "The {Byzantine} generals problem",
  journal     = j-TOPLAS,
  year        = "1982",
  month       = jul,
  volume      = "4",
  number      = "3",
  pages       = "382--401",
  OPTcrossref = "",
  OPTkey      = "",
  OPTnote     = "",
  annote      = "This is one of the all time classic papers in fault
                 tolerant distributed computing: the Byzantine Generals
                 Problem (BGP) is presented and scenarios are discussed
                 where it is solvable and unsolvable. The BGP consists
                 of a set of nodes in a completely connected network,
                 one of which is called the commander and all others
                 are lieutenants. There can be a certain number m of
                 traitors in the set of nodes. The problem is that the
                 commander sends an order to all lieutenants and (1)
                 all lieutenants must obey the same order, and (2) if
                 the commander is not a traitor then every other
                 non-traitor obeys the order he sends. 
The real world scenarios where this problem exists are those where a
                 set of replicated processors must act in unison
                 despite the fact that all get different input (high
                 reliability systems). It turns out that the problem is
                 unsolvable if there are no more than 3m nodes in the
                 network. If messages can be signed, then it remains
                 unsolvable if half the nodes can be traitors. On the
                 other hand, if there are 3m+1 nodes (or 2n+1
                 respectively), then the BGP is solvable. Two
                 algorithms are given. They are presented and proved in
                 a recursive/inductive fashion which is quite stunning.
                 The authors remark, that the problem is unsolvable in
                 asynchronous systems (where there is no possibility of
                 implementing synchronized clocks in the presence of
                 faults). Also, their algorithm for the 3m+1 case seems
                 to be optimal although it requires a message path of
                 m+1 and has a high message complexity. The authors
                 argue that extremely high reliability has its cost.
                 Byzantine behaviour is implicitly modeled by always
                 choosing the worst choice, or considering all choices
                 and choosing the worst."
}

% The booktitle is spelled out in the same form as the pro-podc*
% proceedings macros of this file.
@inproceedings{Ben-Or:1983:AAF,
  author    = "Michael Ben-Or",
  title     = "Another Advantage of Free Choice: Completely
               Asynchronous Agreement Protocols",
  booktitle = "Proceedings of the 2nd Annual ACM Symposium on
               Principles of Distributed Computing (PODC'83)",
  year      = "1983",
  pages     = "27--30",
  annote    = "Ben-Or's probabilistic algorithm for asynchronous
               Byzantine agreement, discussed in
               Section~\ref{sec-byzantine}, was one of the first
               published solutions to the problem, and remains the
               simplest. Processes toss coins independently to reach
               consensus on a value. His algorithm requires that less
               than one-fifth of the processes are faulty for
               correctness to be guaranteed. 
The expected number of rounds is exponential in the number of
               processes $n$, but becomes a constant when the number of
               faulty processes is $O(\sqrt{n})$.",
}

@article{Blum:1983:HES,
  author  = "Manuel Blum",
  title   = "How to Exchange (Secret) Keys",
  journal = j-TOCS,
  volume  = "1",
  number  = "2",
  pages   = "175--193",
  month   = may,
  year    = "1983",
  bibdate = "Thu Jan 14 11:57:59 1999",
  note    = "Previously published in ACM STOC '83 proceedings, pages
             440--447.",
  annote  = "A protocol is presented to fairly exchange secrets using
             number theoretic means. Two parties, Alice and Bob, are
             assumed to have equal computing capabilities and equal
             knowledge of algorithms. There is no need for a trusted
             intermediary and no need for a judge outside of the
             system. There is a negligible probability of cheating. The
             idea is to use gradual exchange and after exchanging an
             individual bit, do some sort of zero-knowledge-proof to
             witness that the bit is actually a valid bit. This is done
             by a complicated challenge response type of method which I
             do not understand (quadratic residues, etc. involved). The
             probability that either can cheat the protocol can be made
             arbitrarily small. However, the usual problems with
             gradual exchange protocols still exist. Section 13
             presents some interesting ideas regarding pricing of
             gradually exchanged bits. Claims to be similar to an early
             TR of Rabin \cite{Rabin:1981:HES}."
}

% volume, number, pages and month are live fields here; with an OPT
% prefix BibTeX ignores them and the reference prints without them.
@article{Lamport:1983:SCP,
  author    = {Leslie Lamport},
  title     = {Specifying concurrent program modules},
  journal   = j-TOPLAS,
  year      = {1983},
  volume    = {5},
  number    = {2},
  pages     = {190--222},
  month     = apr,
  OPTannote = {to get}
}

% Personal names are left unbraced so that BibTeX can split them into
% given name and surname.
@inproceedings{Lamport:1983:WGT,
  author    = "Leslie Lamport",
  title     = "What good is Temporal Logic?",
  booktitle = "Proceedings of the {IFIP} Congress on Information
               Processing",
  year      = "1983",
  editor    = "R. E. A. Mason",
  pages     = "657--667",
  publisher = "North-Holland",
  address   = "Amsterdam",
  annote    = "This is a more informal and easy going introduction into
               the merits of temporal logic than
               \cite{Lamport:1983:SCP}, much in the spirit of a later
               and more refined exposition \cite{Lamport:1989:SAS}.
               Lamport proposes a formal language because ``natural
               languages are very expressive and very imprecise'' while
               ``formal languages are not very expressive but very
               precise.'' The distinction is again that of safety and
               liveness properties, where safety properties can be used
               to reason about real-time behavior if the notion of a
               clock is added. The concept of stuttering is motivated
               and other temporal logic formalisms as of 1983 are
               briefly surveyed. Finally, Lamport elaborates on the
               hierarchy of programming languages, starting from high
               level specifications and ending at the quantum level of
               electrons. Temporal logic can provide a framework for
               reasoning at all these levels.",
}

@article{Schlichting:1983:FSP,
  author  = "Richard D. Schlichting and Fred B. Schneider",
  title   = "Fail stop processors: {An} approach to designing
             fault-tolerant computing systems",
  journal = j-TOCS,
  year    = "1983",
  volume  = "1",
  number  = "3",
  pages   = "222--238",
  month   = aug,
  annote  = "[to read]"
}

@book{Strohrmann:1983:AMM,
  author    = {G. Strohrmann},
  title     = {{Anlagensicherung mit Mitteln der MSR-Technik}},
  publisher = {Oldenbourg},
  year      = {1983},
  address   = {M{\"u}nchen},
  annote    = {[Angabe von Armin]}
}

@inproceedings{Broder:1984:FCM,
  title     = "Flipping coins in many pockets ({Byzantine} agreement on
               uniformly random values)",
  author    = "Andrei Z. 
Broder and Danny Dolev",
  pages        = "157--170",
  booktitle    = "25th Annual Symposium on Foundations of Computer
                  Science",
  month        = "24--26 " # oct,
  year         = "1984",
  address      = "Singer Island, Florida",
  organization = "IEEE",
  annote       = "Discusses randomized Byzantine agreement where a set
                  of processes agree on a common bit using a random
                  coin. Gives algorithm which works if the faulty
                  processes are not the majority. Extends the
                  impossibility result for deterministic consensus by
                  showing that there is no Byzantine agreement protocol
                  tolerant against $t$ fail-stop faults that works in
                  less than $t+1$ rounds."
}

% Acknowledgement macro used by the next entry. It has to be defined
% before its first use, otherwise BibTeX warns about an undefined
% string name and substitutes empty text.
@string{ack-nhfb = "Nelson H. F. Beebe"}

% NOTE(review): this 1983 entry sits among the 1984 entries although
% the file header says entries are sorted by ascending year.
@article{Dijkstra:1983:DTD,
  author          = "Edsger W. Dijkstra and W. H. J. Feijen and
                     A. J. M. {van Gasteren}",
  title           = "Derivation of a Termination Detection Algorithm
                     for Distributed Computations",
  journal         = j-IPL,
  volume          = "16",
  number          = "5",
  pages           = "217--219",
  day             = "10",
  month           = jun,
  year            = "1983",
  coden           = "IFPLAT",
  ISSN            = "0020-0190",
  mrclass         = "68B05 (68C05)",
  mrnumber        = "84m:68005",
  bibdate         = "Wed Nov 11 12:16:26 MST 1998",
  acknowledgement = ack-nhfb,
  classification  = "723; B6210L (Computer communications); C5620
                     (Computer networks and techniques); C6150J
                     (Operating systems)",
  corpsource      = "Burroughs, AL Nuenen, Netherlands",
  journalabr      = "Inf Process Lett",
  keywords        = "computer programming; distributed computations;
                     distributed processing; networks; protocols;
                     termination detection algorithm",
  treatment       = "P Practical",
}

@incollection{Echtle:1984:FSV,
  author      = "Klaus Echtle",
  title       = "{Fehlermodellierung} bei {Simulation} und
                 {Verifikation} {von} {Fehlertoleranz-Algorithmen}
                 {f\"ur} {Verteilte} {Systeme}",
  OPTcrossref = "",
  OPTkey      = "",
  booktitle   = "{Software-Fehlertoleranz} und {-Zuverl\"assigkeit}",
  publisher   = pub-SV,
  year        = "1984",
  editor      = "F. Belli and S. Pfleger and M. 
Seifert",
  number      = "83",
  series      = "Informatik-Fachberichte",
  pages       = "73--88",
  OPTnote     = "(in German)",
  annote      = "Two types of fault models are described and compared:
                 low level fault specifications (LLFS,
                 `aufz{\"a}hlendes Fehlermodell') and high level fault
                 specifications (HLFS, `spezifizierendes
                 Fehlermodell'). LLFS consist of a detailed description
                 of what type of faults may happen and when/where they
                 are supposed to occur (e.g., send omission etc.). They
                 are well suited for simulation and testing. HLFS are a
                 high level description of how a node's behavior
                 changes in the presence of faults. This is expressed
                 at the interfaces between nodes of a distributed
                 system: usually the actions at an interface reflect
                 certain requirements of a protocol specification. The
                 occurrence of faults at a node weakens these
                 requirements. To an extreme (Byzantine behavior),
                 there are no restrictions on what might happen at an
                 interface. HLFS influence interface specifications and
                 are only suited for verification purposes. Both LLFS
                 and HLFS are compared according to their suitability
                 for verification. Finally, the importance of
                 hierarchic fault modelling is stressed to master
                 complexity. This can be seen as an early predecessor
                 of the concept of multitolerance
                 \cite{Arora:1998:CDM}."
}

@phdthesis{Hadzilacos:1984:IFT,
  author = "Vassos Hadzilacos",
  title  = "Issues of Fault Tolerance in Concurrent Computations",
  school = "Harvard University",
  year   = "1984",
  note   = "also published as Technical Report TR11-84.",
  annote = "First mentioning of send omission type of faults. Reference
            found in
            \cite{Schneider:1993:WGM,Hadzilacos:1994:MAF}."
}

@Article{Lamport:1984:UTI,
  author = "Leslie Lamport",
  title = "Using Time Instead of Timeout for Fault-Tolerant Distributed
    Systems",
  journal = j-TOPLAS,
  volume = "6",
  number = "2",
  year = "1984",
  month = apr,
  annote = "[not by me:] processes are synchronized by clocks, and the
    clocks are synchronized using the Byzantine Generals solution. Time
    intervals are used. [to get]",
}

@Article{Lundelius:1984:ULB,
  title = "An Upper and Lower Bound for Clock Synchronization",
  author = "Jennifer Lundelius and Nancy Lynch",
  pages = "190--204",
  journal = "Information and Control",
  month = aug # "/" # sep,
  year = "1984",
  volume = "62",
  number = "2/3",
  annote = "Prove a result similar to \cite{Dolev:1986:PIA}: The clocks of
    $n$ processes cannot be deterministically synchronized more closely
    than $e(1-1/n)$, where $e$ is the uncertainty of message delivery
    times. The assumptions are clocks running at the same speed but
    initialized differently, the given uncertainty $e$, and no failures.
    The graph is completely connected. The result states how close clock
    values can be at the same real time, whereas \cite{Dolev:1986:PIA}
    characterize how close the real times can be when clocks show the
    same value. The idea of the proof is to construct runs which look the
    same to the processes but result in different clock values/real times
    at different points."
}

@TechReport{Shah:1984:DSS,
  pages = "14",
  year = "1984",
  type = "Technical Report",
  number = "TR84-624",
  title = "Distributed Snapshots In Spite of Failures",
  author = "Amitabh Shah and Sam Toueg",
  abstract = "An extension of the Chandy-Lamport algorithm ([Chan84]) to
    find global states of distributed systems is presented where benign
    failures of processes and channels are permitted. The scope of the
    algorithm in detecting stable properties in distributed systems is
    discussed.
As an application, an algorithm to detect deadlocks in failure-prone
    distributed systems is presented.",
  institution = "Cornell University, Computer Science Department",
  month = jul,
  note = "Revised February 1985",
  annote = "Extends the Chandy-Lamport snapshot algorithm
    \cite{Chandy:1985:DSD} to deal with crash-recover faults and message
    losses. The system model is the asynchronous one of
    \cite{Chandy:1985:DSD} and the algorithm uses a simple timeout
    mechanism to check the functional state of neighboring processes
    (today we call this an unreliable failure detector). Channels are
    FIFO and flushing messages are used just like in
    \cite{Chandy:1985:DSD}. However, due to obvious impossibilities the
    notion of a consistent cut must be weakened in a way that includes
    uncertainty. Termination is guaranteed by the timeout solution, but
    the result may be `uncertain' making it necessary to restart the
    algorithm again. It doesn't seem to be guaranteed that eventually a
    stable predicate is detected because of possible channel failures
    (what about false suspicions and virtual partitions?). Has this
    algorithm been published elsewhere?"
}

@Article{Spector:1984:SSP,
  author = {Alfred Spector and David Gifford},
  title = {The space shuttle primary computer system},
  journal = j-CACM,
  year = 1984,
  OPTkey = {},
  volume = 27,
  number = 9,
  OPTmonth = {},
  pages = {874--900},
  OPTnote = {},
  annote = {A detailed description of the computer system that runs the
    space shuttle.}
}

@Article{Alpern:1985:DL,
  author = {Bowen Alpern and Fred B. Schneider},
  title = {Defining liveness},
  journal = j-IPL,
  year = 1985,
  OPTkey = {},
  volume = 21,
  OPTnumber = {},
  OPTmonth = {},
  pages = "181--185",
  OPTnote = {},
  annote = "Standard definitions of system properties, safety and
    liveness. Shows that every nontrivial system property can be
    expressed as an intersection of a safety property and a liveness
    property. Terms safety and liveness defined by Lamport
    \cite{Lamport:1977:PCM}."
}

@Article{Bracha:1985:ACB,
  author = "Gabriel Bracha and Sam Toueg",
  title = "Asynchronous Consensus and Broadcast Protocols",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-ACM,
  year = "1985",
  volume = "32",
  number = "4",
  pages = "824--840",
  month = oct,
  OPTnote = "",
  annote = "The authors investigate probabilistic consensus protocols for
    ``FLP'' model \cite{Fischer:1985:IDC}. Probabilities are introduced by
    making assumptions about the message subsystem, i.e. the probability
    that a node receives a message from all non-faulty nodes can be
    calculated. For the fail-stop model half of the nodes may be faulty
    to still achieve consensus with probability 1, for Byzantine faults
    at most one third may be faulty. The relevant protocols and an
    application to reliable broadcast are given."
}

% The journal's name is "ACM Transactions on Computer Systems" (TOCS).
@Article{Chandy:1985:DSD,
  author = {K. M. Chandy and Leslie Lamport},
  title = {Distributed snapshots: determining global states of distributed
    systems},
  journal = {ACM Transactions on Computer Systems},
  year = {1985},
  OPTkey = {},
  volume = {3},
  number = {1},
  OPTmonth = {},
  pages = {63--75},
  OPTnote = {},
  annote = {nicht kopiert}
}

@InProceedings{Coan:1985:DFS,
  title = "The Distributed Firing Squad Problem (Preliminary Version)",
  author = "Brian A. Coan and Danny Dolev and Cynthia Dwork and Larry
    Stockmeyer",
  pages = "335--345",
  booktitle = "Proceedings of the Seventeenth Annual {ACM} Symposium on
    Theory of Computing",
  month = "6--8 " # may,
  year = "1985",
  address = "Providence, Rhode Island",
  annote = "[to read]"
}

@Article{Cristian:1985:RAF,
  author = {Flaviu Cristian},
  title = {A rigorous approach to fault-tolerant programming},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year = 1985,
  OPTkey = {},
  volume = 11,
  number = 1,
  month = jan,
  pages = "23--31",
  OPTnote = {},
  annote = "First idea of defining faults as spontaneous actions on an
    extended system space."
}

@Article{Fischer:1985:IDC,
  author = {Michael J. Fischer and Nancy A. Lynch and Michael S.
Paterson},
  title = {Impossibility of distributed consensus with one faulty process},
  journal = j-ACM,
  year = 1985,
  volume = 32,
  number = 2,
  month = apr,
  pages = "374--382",
  OPTnote = {},
  OPTannote = "Landmark paper in fault-tolerant distributed computing. The
    system considered is completely asynchronous, nodes may stop by
    halting (crash failure) but may not exhibit hostile (Byzantine)
    behaviour, the message system is reliable with a reliable broadcast
    primitive, no synchronized clocks or the possibility to detect
    failures are assumed. The authors show that every non-trivial
    execution can go on forever without reaching a result, because it is
    in effect not possible to distinguish a crashed node from one that is
    merely very slow. The proof is very detailed and is based on
    non-constructive methods that produce a contradiction from opposite
    assumptions. (Proof is explained in other words in
    \cite{Turek:1992:MFC}.)"
}

@Article{Halpern:1985:OPP,
  title = "Optimal Precision in the Presence of Uncertainty",
  author = "Joseph Y. Halpern and Nimrod Megiddo and Ashfaq A. Munshi",
  pages = "170--196",
  journal = "Journal of Complexity",
  year = "1985",
  month = dec,
  volume = "1",
  number = "2",
  annote = "Analyzes the imprecision inherent in distributed systems that
    have uncertain message delays. Takes the model of
    \cite{Dolev:1986:PIA} and wants to execute coordinated actions
    (instead of doing clock synchronization). Assumes that hardware
    clocks run at the same rate, yet may be initialized differently, and
    that messages have a maximum delivery delay. Basically enriches the
    lower bound of \cite{Dolev:1986:PIA}, and states that probabilistic
    algorithms can do no better (with certainty). Hmm, see
    \cite{Cristian:1989:PCS}. Investigates the situation in which there
    are Byzantine nodes. Gives an algorithm to compute optimal precision
    in cases without faults and bounded precision in cases with faults."
}

% NOTE: the citation key below says 1984 although the entry's year field
% is 1985. The key is kept as it is because other entries in this file
% refer to it via \cite{Hoare:1984:CSP}.
@Book{Hoare:1984:CSP,
  author = "C. A. R.
Hoare",
  title = "Communicating Sequential Processes",
  publisher = "Prentice-Hall",
  year = "1985",
}

% Journal names below use the j-ACM macro defined at the top of the file
% instead of repeating the literal journal title.
@Article{Awerbuch:1985:CNS,
  author = {Baruch Awerbuch},
  title = {Complexity of Network Synchronization},
  journal = j-ACM,
  year = {1985},
  OPTkey = {},
  volume = {32},
  number = {4},
  pages = {804--823},
  month = oct,
  OPTnote = {},
  OPTannote = {}
}

@InProceedings{Laprie:1985:DCF,
  author = "J. C. Laprie",
  title = "Dependable computing and fault tolerance: concepts and
    terminology",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  pages = "2--11",
  booktitle = pro-ftcs85,
  year = "1985",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  month = jun,
  OPTnote = "",
  OPTannote = "[to read]"
}

@Article{Lamport:1985:SCP,
  author = "Leslie Lamport and P. M. Melliar-Smith",
  title = "Synchronizing Clocks in the Presence of Faults",
  journal = j-ACM,
  volume = "32",
  number = "1",
  pages = "52--78",
  month = jan,
  year = "1985",
  url = "http://www.acm.org/pubs/toc/Abstracts/0004-5411/2457.html",
  abstract = "Algorithms are described for maintaining clock synchrony in
    a distributed multiprocess system where each process has its own
    clock. These algorithms work in the presence of arbitrary clock or
    process failures, including ``two-faced clocks'' that present
    different values to different processes. Two of the algorithms
    require that fewer than one-third of the processes be faulty. A third
    algorithm works if fewer than half the processes are faulty, but
    requires digital signatures.",
  keywords = "algorithms; Byzantine failures; clocks, electric ---
    Synchronization; computer programming --- Algorithms; computer
    systems programming --- Multiprocessing Programs; computer systems,
    digital; Fault Tolerant Capability; interactive convergence
    algorithm; reliability; theory; verification; Zeitliche Ordnung",
  annote = "investigates Byzantine clock synchronization. Surveyed in
    \cite{Ramanathan:1990:FCS}.
[to get]"
}

% month uses the standard three-letter macro; the journal uses the j-IPL
% macro that other Information Processing Letters entries in this file use.
@Article{Arora:1986:DTD,
  author = {Rada Krishan Arora and S. P. Rana and M. N. Gupta},
  title = {Distributed termination detection algorithm for distributed
    computations},
  journal = j-IPL,
  year = 1986,
  OPTkey = {},
  volume = 22,
  OPTnumber = {},
  month = may,
  pages = "311--314",
  annote = "See also \cite{Tan:1986:CDT,Arora:1988:MCD}."
}

@Article{Berglund:1986:IV,
  author = "Eric J. Berglund",
  title = "An introduction to the {V}-system",
  journal = "IEEE Micro",
  volume = "6",
  number = "4",
  pages = "35--52",
  month = aug,
  year = "1986",
  annote = "[to read]"
}

@Article{Chandy:1986:HPL,
  author = "K. M. Chandy and Jayadev Misra",
  title = "How processes learn",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-DC,
  year = "1986",
  volume = "1",
  OPTnumber = "",
  pages = "40--52",
  OPTmonth = "",
  OPTnote = "",
  annote = "A formal article on knowledge of processes and how it is
    gained and lost. The notion of knowledge is defined using the concept
    of isomorphism. Two system computations are isomorphic with respect
    to a process if the behaviour of the process is identical in both
    computations. This means essentially that ``a process cannot
    distinguish between them''. A fact that is valid in all
    indistinguishable computations is said to be known by a process. An
    important type of predicate is a local predicate (which is affected
    merely by state changes on one process). These results can be applied
    to situations in which the question is asked: Is a process unsure
    about a fact? These scenarios include the impossibility to detect
    whether a process has crashed. The theory is also applied to show
    that there must be causal message chains in mutual exclusion
    protocols and that the complexity of termination detection is at
    least as large as the message complexity of the underlying
    computation."
}

@InProceedings{Cleve:1986:LSC,
  title = "Limits on the Security of Coin Flips when Half the Processors
    Are Faulty (Extended Abstract)",
  author = "Richard Cleve",
  pages = "364--369",
  booktitle = "Proceedings of the Eighteenth Annual {ACM} Symposium on
    Theory of Computing",
  month = "28--30 " # may,
  year = "1986",
  address = "Berkeley, California",
  annote = "The 2-processor-bit-selection problem is to devise a protocol
    between two processes $A$ and $B$ with the following properties: $A$
    and $B$ start with a random bit value and after termination of the
    protocol both processes output a value $a$ and $b$, respectively,
    where $a=b$ (agreement). Processes internally have access to a random
    variable. A weaker definition of agreement states, that the
    probability that $a=b$ must be bounded from below by $1-O(1/n^k)$
    where $n$ and $k$ are not clear to me. A
    2-processor-bit-selection-scheme is secure if the protocol achieves
    the weaker definition of agreement (or better) even in the case where
    one process is replaced by a faulty one. The author gives an
    impossibility result stating that there exists no secure
    2-processor-bit-selection protocol (Section 2.2). (I didn't get the
    idea behind the proof.) This result is extended to a definition of an
    $s$-processor-bit-selection scheme. The new result states that it is
    impossible to reach (weak) agreement if $\lceil s/2 \rceil$ of the
    processors are faulty. The paper must be seen in the context of
    probabilistic Byzantine agreement, I suppose."
}

@InProceedings{Cristian:1986:CSP,
  author = "F. Cristian and H. Aghili and R. Strong",
  title = "Clock Synchronization in the Presence of Omission and
    Performance Faults",
  booktitle = pro-ftcs86,
  pages = "218--223",
  publisher = pub-IEEE,
  address = "Vienna, Austria",
  year = "1986",
  annote = "Revised version read as \cite{Cristian:1994:CSP}."
}

@Article{Dolev:1986:PIA,
  author = {Danny Dolev and Joseph Y. Halpern and H.
Raymond Strong},
  title = {On the possibility and impossibility of achieving clock
    synchronization},
  journal = {Journal of Computer and System Sciences},
  year = {1986},
  OPTkey = {},
  volume = {32},
  number = {2},
  pages = {230--250},
  month = apr,
  OPTnote = {},
  annote = {The authors prove that clock synchronization is impossible
    without authentication if at least one third of the processors are
    faulty. They also give a lower bound on the precision of local
    clocks: Define $U$ to be the maximum uncertainty in the network, i.e.
    the maximum difference between minimum and maximum message
    transmission time for any pair of directly connected processes. The
    imprecision of local clocks is at least half the uncertainty, i.e.
    there is no algorithm that synchronizes clocks of two adjacent
    processes closer than $U/2$. An extended result appears in
    \cite{Halpern:1985:OPP}. A result similar to this can be found in
    \cite{Lundelius:1984:ULB} (see the discussion there).}
}

@Article{Dolev:1986:RAA,
  author = "Danny Dolev and Nancy A. Lynch and Shlomit S. Pinter and Eugene
    W. Stark and William E. Weihl",
  title = "Reaching approximate agreement in the presence of faults",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-ACM,
  year = "1986",
  volume = "33",
  number = "3",
  pages = "499--516",
  month = jul,
  OPTnote = "",
  annote = "[to read]"
}

@Article{Fischer:1986:EIP,
  author = "Michael J. Fischer and Nancy A. Lynch and Michael S. Paterson",
  title = "Easy impossibility proofs for distributed consensus problems",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-DC,
  year = "1986",
  volume = "1",
  OPTnumber = "",
  pages = "26--39",
  OPTmonth = "",
  OPTnote = "",
  annote = "[to read]"
}

@Book{Francez:1986:F,
  author = "Nissim Francez",
  title = "Fairness",
  publisher = pub-SV,
  series = "Texts and Monographs in Computer Science",
  year = "1986",
  annote = "A thorough book on many notions of fairness in scheduling
    concurrent actions. Possibly outdated because many new notions seem
    to have appeared (any references?)"
}

@Article{Liskov:1986:SDP,
  author = "Barbara Liskov and William Weihl",
  title = "Specifications of Distributed Programs",
  journal = j-DC,
  publisher = pub-SV,
  year = "1986",
  volume = "1",
  pages = "102--118",
  annote = "An early advocate of having two separate sets of
    specifications: one for the normal operation and a weaker one for
    ``abnormal'' behavior (the tolerance specification of
    \cite{Gaertner:1999:ESD}). The authors argue that this is user
    friendly and also simplifies the specifications. Several examples of
    such specifications are given (which I did not look at in detail).
    The conclusions contain a somewhat misleading discussion on why
    liveness is not the correct property to describe abnormal behavior.
    Rather, the likelihood of abnormal behavior should be specified (but
    this is a point of future work). At the end, the authors indicate
    that having a tolerance specification eases the understanding of
    implementation constraints and so a tolerance specification is also
    of use to implementors. The tolerance specification can be seen as
    the ``first refinement'' of the original specification.",
}

@Article{Moses:1986:CHO,
  author = "Yoram Moses and Danny Dolev and Joseph Y. Halpern",
  title = "Cheating husbands and other stories: {A} case study of
    knowledge, action, and communication",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-DC,
  year = "1986",
  volume = "1",
  OPTnumber = "",
  pages = "167--176",
  OPTmonth = "",
  OPTnote = "",
  annote = "The authors again take the cheating husbands puzzle to show
    subtle interactions between knowledge, action, and communication in
    distributed systems. They discuss the cases of asynchronous
    communication, synchronous communication, weakly synchronous
    communication with bound b, and asymmetry in communication (ring
    topology). The relationship to eventual common knowledge, common
    knowledge and b-common knowledge are shown. Moreover, in the
    synchronous case, faulty nodes can complicate the matter again
    (disobedient wives).
This paper is shorter and thus a little more introductory than a later
    one \cite{Halpern:1990:KCK}."
}

@Article{Myers:1986:CSF,
  author = {W. Myers},
  title = {Can software for the strategic defense initiative ever be error
    free?},
  journal = {IEEE Computer},
  year = {1986},
  OPTkey = {},
  volume = {19},
  number = {11},
  OPTpages = {},
  month = nov,
  OPTnote = {},
  annote = {Presents figure that there are about 3.3 software errors per
    1000 LoC. Peter G. Neumann comments on this in `Inside Risks' in late
    2000 CACM.}
}

@Article{Perry:1986:DAP,
  title = "Distributed Agreement in the Presence of Processor and
    Communication Faults",
  author = "Kenneth J. Perry and Sam Toueg",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  pages = "477--482",
  month = mar,
  year = "1986",
  volume = "12",
  number = "3",
  annote = "First to define the general omission fault model consisting of
    crash, send- and receive-omission failures. [to get]"
}

% month uses the standard three-letter macro; the journal uses the j-IPL
% macro that other Information Processing Letters entries in this file use.
@Article{Tan:1986:CDT,
  author = {Richard B. Tan and Gerard Tel and Jan {van Leeuwen}},
  title = {Comments on {``Distributed termination detection algorithm for
    distributed computations''} ({Letter} to the {Editor})},
  journal = j-IPL,
  year = 1986,
  OPTkey = {},
  volume = 23,
  OPTnumber = {},
  month = oct,
  pages = "163",
  annote = "Notes an error in the algorithm of \cite{Arora:1986:DTD}. See
    also \cite{Arora:1988:MCD}."
}

@InProceedings{Attiya:1987:ACA,
  author = "Hagit Attiya and Amotz Bar-Noy and Danny Dolev and Daphne
    Koller and David Peleg and R{\"u}diger Reischuk",
  title = "Achievable cases in an asynchronous environment",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  pages = "337--346",
  booktitle = "Proceedings of the 28th annual Symposium on the Foundations
    of Computer Science",
  year = "1987",
  OPTorganization = "",
  publisher = pub-IEEE-CSP,
  OPTaddress = pub-IEEE-CSP:adr,
  month = oct,
  OPTnote = "",
  annote = "The authors consider several problems and show that they are
    achievable in asynchronous systems despite the fact that things like
    consensus aren't. Problems considered are renaming of processors to
    compact the name space and the ``multi-slot critical section
    problem'' (which is mutual exclusion for more than one processor)."
}

@Book{Bernstein:1987:CCR,
  author = {P. Bernstein and V. Hadzilacos and N. Goodman},
  title = {Concurrency Control and Recovery in Database Systems},
  publisher = pub-AW,
  year = {1987},
  OPTnote = {},
  OPTannote = {H.2.5/Bern nicht am Ort}
}

% ACM TOCS volume 5, number 1 appeared in 1987, which also matches the
% year encoded in the citation key.
@Article{Birman:1987:RCP,
  author = {K. P. Birman and T. A. Joseph},
  title = {Reliable Communication in the Presence of Failures},
  journal = {ACM Transactions on Computer Systems},
  volume = {5},
  number = {1},
  month = feb,
  year = {1987},
  pages = {47--76},
  annote = {First reference to causal order, the generalization of
    Lamport's happened-before \cite{Lamport:1978:TCO}.}
}

@Article{Brooks:1987:NSB,
  author = "Frederick P. Brooks",
  title = "No Silver Bullet",
  journal = j-IEEE-COMPUTER,
  volume = "20",
  number = "4",
  pages = "10--19",
  month = apr,
  year = "1987",
  annote = "A famous paper on the ``essence and accidents in software
    engineering''. Brooks explores reasons for the fact that despite high
    hopes and great claims the software industry and computer science
    academia has failed to produce really reliable, error-free products.
Brooks discusses facts like complexity and psychological problems for
    people involved. He states that the problem is at its core rather a
    human than a technical issue."
}

@Article{Dolev:1987:MSN,
  author = "Danny Dolev and Cynthia Dwork and Larry Stockmeyer",
  title = "On the minimal synchronism needed for distributed consensus",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-ACM,
  year = "1987",
  volume = "34",
  number = "1",
  pages = "77--97",
  month = jan,
  OPTnote = "",
  annote = "This is a refinement work of the paper by Fischer, Lynch and
    Paterson \cite{Fischer:1985:IDC}. The consensus problem is
    investigated in various different system models. Critical parameters
    that emerge are: processors synchronous/asynchronous, communication
    synchronous/asynchronous, message order synchronous/asynchronous,
    broadcast transmission or point to point, atomic receive/send or
    separate receive/send. The minimal cases to achieve consensus are:
    (1) synchronous processors and synchronous communication, (2)
    synchronous processors and synchronous message order, (3) synchronous
    message order and broadcast communication, (4) synchronous
    communication, broadcast transmission, and atomic receive/send. The
    intuition behind the results is that the system shouldn't be able to
    ``hide a critical step''. Probabilistic algorithms are not
    investigated. The proofs in this paper are large and quite
    intrinsic."
}

% "He" is the first author's family name, so the name is written in
% "Last, First" form for BibTeX's name parser. The citation key is kept
% unchanged because other entries in this file refer to it.
% The journal uses the j-DC macro defined at the top of the file.
@Article{Jifeng:1987:ASP,
  title = "Algebraic Specification and Proof of a Distributed Recovery
    Algorithm",
  author = "He, Jifeng and C. A. R. Hoare",
  journal = j-DC,
  pages = "1--12",
  year = "1987",
  volume = "2",
  number = "1",
  annote = "A masking fault tolerant implementation to a crash-recover
    process is presented and proved using basics of CSP. Two different
    implementations are presented: one that simply replays and one that
    uses checkpoints.
Faults are detected instantaneously and ``the only subtle point is to
    ensure the correct outcome even when [faults] occur in the middle of
    the recovery procedure.'' (p. 2) I didn't find this point though in
    the proof. The discussion makes a few good points and contributes to
    the overall quality of the paper: (1) instead of having a general
    purpose mechanism to prove any system, every application area
    probably will have its adopted calculus: ``Nevertheless, even for a
    grossly over-simplified problem, the algebraic calculations are
    non-trivial. This probably has to be accepted as inevitable in any
    serious application of mathematics to engineering. The calculations
    can be simplified by prior development of a calculus adapted more to
    the specific needs of a problem. It will be interesting to see how
    far such calculi are applicable to more general classes of problems;
    but it seems quite likely that they will not. Again, we may have to
    accept that each application will require derivation of specialized
    laws to control its complexity.'' (page 9) (2) recovery with
    non-instantaneous fault detection can probably be based on logical
    time, (3) non-deterministic processes cannot use this type of
    recovery. A weakened specification is necessary."
}

@Article{Joseph:1987:PRF,
  title = "Proof Rules for Fault Tolerant Distributed Programs",
  author = "Mathai Joseph and Abha Moitra and Neelam Soundararajan",
  pages = "43--67",
  journal = "Science of Computer Programming",
  month = feb,
  year = "1987",
  volume = "8",
  number = "1",
  annote = "The authors attempt to develop a set of rules to prove the
    correctness of CSP programs \cite{Hoare:1984:CSP} in faulty
    environments. The failure model is that of fail-stop, i.e. the
    authors assume detectable crash faults and recovery without stable
    storage. The method concentrates on partial correctness of
    terminating processes and also on the invariants of non-terminating
    processes (i.e. it concentrates on safety properties).
The proof rules show how the interface (i.e. communication) behavior of
    processes is weakened by the failure model and how the behavior of
    the complete system can be obtained from the behaviors of the
    individual processes. The recovery-aspect of the failure model
    weakens the achievable safety property because of possible
    repetitions. But overall, the global invariant is the conjunction of
    the local invariants provided that the processes are ``compatible''
    (meaning that their communication behavior matches). Channels are
    assumed to be reliable. Sect. 6 contains the first derivation rule of
    weaker safety properties that I know of. A bounded buffer is taken as
    an example. The future work section discusses general liveness
    properties and states that they are difficult to prove! The basic
    fault-tolerance methodology involved here is based on detection
    (which is assumed to be automatic) and correction through recovery
    actions. This is the basis of later work in this direction
    \cite{Peled:1994:CFF,Arora:1998:CDM,Arora:1998:DCT}."
}

% Page ranges use the en-dash form "--".
@Article{Mattern:1987:ADT,
  author = {Friedemann Mattern},
  title = {Algorithms for distributed termination detection},
  journal = j-DC,
  year = {1987},
  OPTkey = {},
  volume = {2},
  number = {3},
  pages = {161--175},
  OPTmonth = {},
  OPTnote = {},
  annote = {[to get]}
}

% The journal uses the j-IPL macro, as in the other Information Processing
% Letters entries of this file that already use it.
@Article{Moran:1987:EIR,
  author = "Shlomo Moran and Yaron Wolfstahl",
  title = "Extended impossibility results for asynchronous complete
    networks",
  journal = j-IPL,
  volume = "26",
  number = "3",
  pages = "145--151",
  day = "23",
  month = nov,
  year = "1987",
  affiliationaddress = "Technion Israel Inst of Technology, Haifa, Isr",
  journalabr = "Inf Process Lett",
  keywords = "asynchronous complete networks; computational complexity;
    computer fault tolerance; computer systems, digital; consensus
    problem; consensus task; decision graph; Distributed; distributed
    computation; distributed processing; fault tolerance; fault tolerant
    computing; graph theory; impossibility results; mathematical
    techniques --- Graph Theory; protocol; protocols; reliability;
    standardization; theory; unsolvability; verification",
  annote = "[to read]"
}

@Book{Raynal:1987:NDC,
  author = "Michel Raynal",
  title = "Networks and Distributed Computation: Concepts, Tools, and
    Algorithms",
  publisher = "North Oxford Academic Publishers",
  address = "London",
  year = "1987",
  keywords = "book, text, parallel processing, supercomputers,",
  ISBN = "0-946536-27-9",
  note = "Original French language edition Systemes repartis et reseaux
    (1987), translated by Meg Sanders",
}

@Article{Srikanth:1987:OCS,
  author = "T. K.
Srikanth and Sam Toueg",
  title = "Optimal Clock Synchronization",
  journal = j-ACM,
  volume = "34",
  number = "3",
  pages = "626--645",
  month = jul,
  year = "1987",
  url = "http://www.acm.org/pubs/toc/Abstracts/0004-5411/28876.html",
  abstract = "We present a simple, efficient, and unified solution to the
    problems of synchronizing, initializing, and integrating clocks for
    systems with different types of failures: crash, omission, and
    arbitrary failures with and without message authentication. This is
    the first known solution that achieves optimal accuracy - the
    accuracy of synchronized clocks (with respect to real time) is as
    good as that specified for the underlying hardware clocks. The
    solution is also optimal with respect to the number of faulty
    processes that can be tolerated to achieve this accuracy.",
  keywords = "algorithms; Byzantine failures; computer programming ---
    Algorithms; computer systems, digital; Distributed; message
    authentication; optimal clock synchronization; reliability;
    synchronizing in presence of faults; theory; verification",
  annote = "Assumes that drift rate is bounded on processes and that there
    is a maximum message delivery delay. Gives tolerance specification of
    clock synchronization. Shows lower bound on accuracy dependent on the
    drift rate of clocks: the bound on the drift rate of logical clocks
    is at least as large as the bound of drift of the physical clocks
    (Theorem 2). They present an algorithm which reaches this bound."
}

% The journal uses the j-DC macro defined at the top of the file.
@Article{Apt:1988:AFL,
  title = "Appraising Fairness in Languages for Distributed Programming",
  author = "Krzysztof R. Apt and Nissim Francez and Shmuel Katz",
  journal = j-DC,
  pages = "226--241",
  year = "1988",
  volume = "2",
  number = "4",
  annote = "A general formulation of fairness is: if a certain choice is
    possible infinitely often, then it is sufficiently often taken.
    Precise formulations depend on how `choice', `possible' and
    `sufficiently often' are defined.
The authors propose three basic criteria which any sensible definition of
    fairness should have in any model: feasibility, equivalence
    robustness, and liveness enhancement. (a) Fairness usually rules out
    certain traces which would be acceptable in the given model of
    computation. Feasibility ensures that after ruling out unfair traces,
    still valid traces remain. More precisely, feasibility requires that
    for every point in a computation it should be possible to extend it
    to be a fair one. This is related to the notion of machine closure
    \cite{Lamport:2000:FAH}. (b) Equivalence robustness means that if a
    trace x is fair, then a trace y must also be fair where y results
    from x by resorting `independent' actions. (c) Liveness enhancement
    means that all distributed system models assume a fundamental
    liveness property, meaning for example that eventually the system
    will take a step if it is not deadlocked. A fairness definition must
    give `additional value' to such an assumption, i.e., there must be a
    program which has a liveness property only if the additional fairness
    requirement holds."
}

% month uses the standard three-letter macro; the journal uses the j-IPL
% macro that other Information Processing Letters entries in this file use.
@Article{Arora:1988:MCD,
  author = {Rada Krishan Arora and M. N. Gupta},
  title = {More comments on {``Distributed termination detection algorithm
    for distributed computations''} ({Letter} to the {Editor})},
  journal = j-IPL,
  year = 1988,
  OPTkey = {},
  volume = 29,
  OPTnumber = {},
  month = sep,
  pages = {53--55},
  annote = "See also \cite{Arora:1986:DTD,Tan:1986:CDT}. Tries to fix the
    error."
}

@Book{Chandy:1988:PPD,
  author = "K.
Mani Chandy and Jayadev Misra",
  title = "Parallel Program Design: {A} Foundation",
  publisher = pub-AW,
  address = "Reading, Mass.",
  year = "1988",
  annote = "[to read]"
}

@Article{Dwork:1988:CPP,
  author = "Cynthia Dwork and Nancy Lynch and Larry Stockmeyer",
  title = "Consensus in the presence of partial synchrony",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-ACM,
  year = "1988",
  volume = "35",
  number = "2",
  pages = "288--323",
  month = apr,
  OPTnote = "",
  annote = "The authors study practically motivated models of synchrony
    that lie between fully asynchronous and fully synchronous systems in
    which consensus shall be achieved. The models of partial synchrony
    studied include: (1) upper bounds on processor speeds and message
    latency exist but are unknown, and (2) upper bounds exist and are
    known, but only hold after some unknown time (eventually). In both
    cases consensus with different resiliency can be achieved."
}

@InProceedings{Fidge:1988:TMP,
  author = {Colin J. Fidge},
  title = {Timestamps in message-passing systems that preserve partial
    ordering},
  booktitle = {Proceedings of the 11th Australian Computer Science
    Conference},
  OPTcrossref = {},
  OPTkey = {},
  pages = {56--66},
  year = {1988},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  month = feb,
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  OPTannote = {}
}

% NOTE(review): editor surname spelled "Shriver" here (was "Schriver");
% confirm against the printed proceedings.
@InProceedings{Haban:1988:GEG,
  author = "Dieter Haban and Wolfgang Weigel",
  title = "Global Events and Global Breakpoints in Distributed Systems",
  booktitle = "Proceedings of the Twenty-First Annual Hawaii International
    Conference on System Sciences",
  year = "1988",
  month = jan,
  pages = "166--175",
  editor = "Bruce D. Shriver",
  volume = "II (Software Track)",
  publisher = pub-IEEE,
  annote = "[to read]"
}

@InProceedings{Herlihy:1988:RAO,
  author = "Maurice P. Herlihy and Jeannette M. Wing",
  title = "Reasoning about Atomic Objects",
  pages = "193--208",
  ISBN = "3-540-50302-1",
  editor = "M.
Joseph", booktitle = "Proceedings of the Symposium on Formal Techniques in Real-Time and Fault-Tolerant Systems", month = sep, series = ser-LNCS, volume = "331", publisher = pub-SV, year = "1988", annote = "formal proof method for fault tolerant programs, to read" } @Book{Isermann:1988:DRS, author = {Rolf Isermann}, title = {{Digitale Regelsysteme, Band I (in German)}}, publisher = pub-SV, year = {1988}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {[to read]} } @ARTICLE{Kessels:1988:EPS, AUTHOR = "J. L. W. Kessels", TITLE = "An exercise in proving self-stabilization with a variant function", JOURNAL = j-IPL, VOLUME = 29, YEAR = 1988, PAGES = "39--42", annote = "Correctness proof of Dijkstra's 3-state mutual exclusion protocol \cite{Dijkstra:1974:SSS} using a bound function. It shows the general technique of proving convergence by a variant function and also exposes the intrinsic dangers and difficulties of this method." } @Article{Knuth:1988:SDS, author = {T. Knuth}, title = {{Schadenfr\"uherkennung durch Schwingungsanalysen --- Neue M\"oglichkeiten in der Instandhaltung}}, journal = {Der Maschinenschaden}, year = {1988}, OPTkey = {}, volume = {61}, OPTnumber = {}, pages = {70--74}, OPTmonth = {}, OPTnote = {}, annote = {[Angabe von Armin]} } @InCollection{Mancini:1988:TTR, author = "Luigi V. Mancini and Giuseppe Pappalardo", editor = "M. J.
Warick", title = "Towards a theory of replicated processing", booktitle = "Formal techniques in real-time and fault-tolerant systems", series = ser-LNCS, volume = "331", publisher = pub-SV, year = "1988", annote = "specification approach [to read]" } @InProceedings{Mattern:1988:VTG-CITE-1989-VERSION, author = {Friedemann Mattern}, title = {Virtual time and global states of distributed systems}, booktitle = {Proceedings of the International Workshop on Parallel and Distributed Algorithms}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {1988}, editor = {M. Cosnard}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Chateau de Bonas, France}, OPTmonth = oct, OPTorganization = {}, publisher = {Elsevier}, OPTnote = {}, annote = {Reprinted on pages 123--133 in \cite{Yang:1994:GST}; see also the 1989 entry \cite{Mattern:1989:VTG}.} } @InProceedings{Miller:1988:BHD, author = "Barton P. Miller and Jong-Deok Choi", title = "Breakpoints and halting in distributed programs", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "316--323", booktitle = "Proceedings of the 8th International Conference on Distributed Computing Systems", year = "1988", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", annote = "The authors address the problem of distributed debugging by formally defining distributed predicates and giving algorithms to detect such predicates and to halt a distributed computation in a consistent state once the predicate is satisfied. Predicates are either (1) simple predicates (defined over the state of a single process), (2) disjunctions of simple predicates, (3) linked predicates (a sequence of events ordered by the causality relation) and (4) conjunctions of simple predicates. Types (1) and (2) are easily detectable by detection modules within processes. Type (3) is detected by tracking event occurrences on the processes involved in each item on the predicate chain and following causal dependencies across channels with markers.
Type (4) needs a definition of its semantics since there is no single notion of time: $A\land B$ is true whenever $A$ becomes true on one process and $B$ subsequently becomes true on another process and $B$ causally depends on $A$. If no causal relationship exists between $A$ and $B$, then a central observer is used to detect $A\land B$. An algorithm to detect these predicates and to halt the algorithm is given based on the Chandy/Lamport algorithm to observe computations (requires FIFO channels, but asynchronous system). The paper describes problems related to distributed debugging. The detection of conjunctions implicitly defines ``possibly($A\land B$)'' without stating how to detect this in all cases in a distributed fashion. But this is okay, since this paper rather aims at detecting dynamic properties, and possibly is a static property. See also a good continuation of this work by Babaoglu et al. \cite{Babaoglu:1996:UFS}." } @InProceedings{Patterson:1988:CRA, author = "David A. Patterson and Garth Gibson and Randy H. Katz", title = "A {Case} for {Redundant} {Arrays} of {Inexpensive} {Disks} ({RAID})", booktitle = "Proceedings of the ACM Conference on Management of Data (SIGMOD)", year = "1988", month = jun, OPTaddress = "Chicago, IL", pages = "109--116", abstract = "As processor and memory speeds increase at an exponential rate and single disk access times remain relatively constant, it is apparent that I/O bandwidth is likely to become a bottleneck in the performance of systems. One way to address this problem is by using disk arrays, i.e., sets of relatively inexpensive disks which can improve I/O bandwidth via parallel access. The problem with this approach is that simply using disk arrays can drastically reduce reliability. The approach of RAID is to use redundant disks of check data to bring reliability up to acceptable levels (i.e., failure rates better than expected useful life of the disks).
Five levels of the RAID design are presented to address the issues of overhead cost (in terms of number of disks), useable storage capacity, and efficiency per disk for various read and write scenarios (i.e., large vs. small). These issues were considered in terms of {\em data rates} (supercomputer applications) and {\em I/O rates} (transaction systems). Level 5 RAID provides the best all around performance by distributing check data across the data disks to increase parallelism.", annote = "to read" } @Book{Raynal:1988:DAP, author = "Michel Raynal", title = "Distributed Algorithms and Protocols", series = "Wiley Series in Computing", pages = "163", publisher = "John Wiley \& Sons", address = "Chichester, England", year = "1988", keywords = "book, text, parallel processing, supercomputers, electronic data processing -- distributed processing, algorithms, computer network protocols", ISBN = "0-471-91754-0", abstract = "More theoretical book on the fundamental problems in distributed systems and some solutions. 1st. English issue 1988 (the French version was published in 1985).\par ** Description ** The use of distributed algorithms offers the prospect of great advances in computing speed. This book provides a clear, practical, and up-to-date guide to distributed algorithms and protocols in the area of control. Much of the material has been heretofore unavailable in English. Each chapter considers a specific aspect of control, with an analysis of the problem, a description of the algorithm for solving it, and proofs of correctness. Chapters can be studied independently to find solutions to particular problems.\par ** Contents ** Introduction to Distributed Algorithms. Election and Mutual Exclusion Algorithms. Algorithms for Detection and Resolution of Deadlock. Algorithms for Detecting Termination. Protocols for Data Transfer. Management of Distributed Data. Problems of Gaining Consensus in the Presence of Uncertainties (or How to Avoid Byzantine Quarrels).
References.", note = "Algorithmes distribues et protocoles, translated by Jack Howlett", } @InProceedings{Abadi:1989:RUS, author = {Mart{\'\i}n Abadi and Leslie Lamport and Pierre Wolper}, title = {Realizable and unrealizable specifications of reactive systems}, booktitle = {Automata, Languages and Programming. 16th Int.~Colloquium Proceedings}, OPTcrossref = {}, OPTkey = {}, pages = {1--17}, year = {1989}, editor = {G. Ausiello and M. Dezani-Ciancaglini and S. Ronchi Della Rocca}, volume = {372}, OPTnumber = {}, series = ser-LNCS, address = {Stresa, Italy}, month = jul, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {A specification is a formula $E\Rightarrow M$ where $E$ is an assumption about the environment and $M$ is a property guaranteed by the system (this way of viewing specifications is described in \cite{Lamport:1989:SAS,Abadi:1993:CS}). Specifications can become unrealizable if $E$ asserts some property of the environment because this part of the universe is totally outside the control of the implementor. Thus, a specification is unrealizable if it constrains the environment. This paper studies the exact definitions and conditions of realizability. The first approach is to define a simple computer and base the definition of realizability on the fact that a specification can be implemented on such a device. On the other hand, it views a specification as the rules for a two-player infinite game where environment and system both take turns and try to win. The environment wins if it can produce unspecified executions. Otherwise the system wins. It turns out that a specification is realizable if the system has a winning strategy. Realizability of a specification is a different notion than consistency (i.e., whether the set of infinite behaviors of the system is nonempty). This paper is very theoretical and uses a lot of terminology and concepts that I am not familiar with (B\"uchi automata, Borel sets, etc).
The ideas of game-playing and specifications appear again in \cite{Abadi:1993:CS} in an at least to me more understandable fashion. } } @InProceedings{Chor:1989:RBA, author = "Benny Chor and Cynthia Dwork", title = "Randomization in {Byzantine} Agreement", booktitle = "Advances in Computing Research 5: Randomness and Computation", publisher = "JAI Press", year = "1989", pages = "443--497", OPTnote = "A useful survey of the myriad of randomized distributed algorithms for Byzantine agreement.", annote = "[to read]" } @Article{Cristian:1989:PCS, author = {Flaviu Cristian}, title = {Probabilistic clock synchronization}, journal = j-DC, year = {1989}, OPTkey = {}, volume = {3}, OPTnumber = {}, pages = {146--158}, OPTmonth = {}, OPTnote = {}, annote = {A very well-written introduction into clock synchronization in ``real'' systems, and a good starting point for a lecture on this topic. Assumes that there is a maximum drift rate but there is no maximum message delivery delay. In this setting, clock synchronization can only be achieved in a probabilistic manner and Cristian well explains the inherent tradeoffs. Mentions that modern quartz clocks have a drift rate of the order $10^{-6}$, messages have some minimum time to travel but the distribution of delivery times (while usually being close to the minimum) is arbitrary. Nodes and messages can only suffer performance failures. In a nice exposition, it is explained how a node reads another node's clock and within what bounds the reading is as well as the error. (The precision of the reading is better the shorter the round trip time of the reading was.) Fixing the error results in a maximum time which a node is willing to wait for a result. There is a fundamental trade-off between the precision of the reading and the probability of success. Other algorithms like \cite{Srikanth:1987:OCS,Dolev:1995:DFC} are deterministic, i.e. they always reach a result but have poor precision.
There is a continuum of probabilistic algorithms between the bounds of setting the maximum waiting time. Setting it close to the minimum is ``aggressive'' and will get good results with low probability. The other extreme are deterministic algorithms. Cristian also sketches implementations of time services and gives real-world numbers to instantiate the formulas given. It shows that synchronization within milliseconds is achievable. Overall, one of my top-ten favourite papers. } } @InProceedings{Gopal:1989:RBS, author = "A. Gopal and S. Toueg", title = "Reliable Broadcast in Synchronous and Asynchronous Environments", booktitle = pro-wdag89, address = "Nice, France", year = "1989", pages = "110--123", annote = "[to read]" } @InProceedings{Gray:1989:LEF, author = "Cary G. Gray and David R. Cheriton", title = "Leases: {An} efficient fault-tolerant mechanism for distributed file cache consistency", booktitle = "Proceedings of the 12th ACM Symposium on Operating System Principles", conflocation = "Litchfield Park, AZ, 3--6 December 1989", journal = "Operating Systems Review", volume = "23", number = "5", year = "1989", month = dec, pages = "202--210", key = "Gray89", keywords = "Gray89 time-based distributed coherency, distributed file systems, V performance, lease", abstract = "Caching introduces the overhead and complexity of ensuring consistency, reducing some of its performance benefits. In a distributed system, caching must deal with the additional complications of communication and host failures. {\em Leases} are proposed as a time-based mechanism that provides efficient consistent access to cached data in distributed systems. Non-Byzantine failures affect performance, not correctness, with their effect minimized by short leases. An analytic model and an evaluation for file access in the V system show that leases of short duration provide good performance.
The impact of leases on performance grows more significant in systems of larger scale and higher processor performance.", annote = "A lease is a contract that gives its holder specified rights over an object for a limited period of time. In the case where file cache consistency is to be maintained, a cache must obtain a lease for an object when the application accesses that object. A lease implicitly contains a lease term (duration) which describes its validity over time. Only with a valid lease a cache is allowed to answer read requests for that object. If the cache is requested to update an object, the cache must obtain a lease (if it doesn't have one already) and must then obtain approval by all other leaseholders for the write. When granting approval, leaseholders give up their lease. Here, fault tolerance comes into play: a client wanting to update an object must wait either until it has an approval of all leaseholders or until all of their leases have expired. (To prevent starvation, no new leases for an object are granted during this waiting time.) This can effectively help combat non-Byzantine faults in the system. Leases can introduce \emph{false sharing}, i.e. lease conflicts where no actual write conflicts exist, for example if another client cache has obtained a lease but has stopped using the object long before the lease has expired. For this, short lease terms are good. Short lease terms also minimize the delay caused by network partitions and client crashes (this is analogous to short aggressive time-outs in failure detection). Long term leases have the advantage if objects are accessed repeatedly by the same client and there is little write sharing.
Analytical and experimental results are presented, stating that lease terms of 5--10 seconds in the V system are quite good, based on read and write rates between 0.03 (writes) and 0.8 (reads) per second, message propagation of 1 msec, message processing time of 0.25 msec and maximum clock skew of 100 msec. These simulations however do not refer to fault tolerance issues. The leases mechanism is dependent on synchronized clocks. A minimum assumption is that clocks have a known bounded drift rate. In this case, leases can be simply communicated using their duration. Server clocks that advance too quickly and client clocks which are too slow are problematic and can cause errors while the opposite (e.g. slow server clocks etc.) simply cause more message traffic. The conclusions contain a good cite which is in the spirit of Cristian and Fetzer's timed model \cite{Cristian:1999:TAD}: ``The lease approach is an example of a communication and coordination mechanism and reasoning based on (real) time, the availability of clocks that measure the passage of time with modest accuracy, and the ability to draw conclusions after a passage of time, possibly in the absence of communication. [...] We see this use of time as a fundamental aspect of distributed systems with potential for significant extension beyond that described here.'' " } @Article{Halpern:1989:MKA, author = {Joseph Y. Halpern and Ronald Fagin}, title = {Modelling knowledge and action in distributed systems}, journal = j-DC, year = {1989}, OPTkey = {}, volume = {3}, OPTnumber = {}, OPTmonth = {}, pages = {159--177}, OPTnote = {}, OPTannote = "[to do]" } @Article{Lamport:1989:SAS, author = {Leslie Lamport}, title = {A simple approach to specifying concurrent systems}, journal = j-CACM, year = {1989}, OPTkey = {}, volume = {32}, number = {1}, pages = {32--45}, month = jan, OPTnote = {}, annote = {An amusing but still challenging paper on formal specifications of concurrent programs.
Lamport informally presents the ``transition axiom method'' which is described in detail in \cite{Lamport:1983:SCP}. A system is a `thing' that interacts with its environment through a well-defined interface. The system properties in question are described as safety and liveness, which capture the essence of system behavior relevant to the author. (There are system properties not expressible as safety and liveness, some are given, confer also \cite{Rushby:1994:CSP}.) Safety properties are discussed first: A simple soda vending machine with three (specification) states and four (specification) state transitions is taken as an example. The essence of Lamport's specification method is to say which state transitions are allowed and which ones aren't. A system may have some unspecified state set $S$, and a specification can be viewed as a restriction on some state function $f$ from $S$ to the set of specification states. The machine's behavior is a sequence of states $s_0,s_1,\ldots$ from $S$. A programmer wishing to implement the specification must find such a state function $f$ which changes its state according to the specification and some interface actions. Finding such a function is like proving that the implementation is correct regarding the specification. A specification must also always contain a description of the interface of the system in question. This description is naturally at an implementation level. The formula underlying a transition axiom specification is a temporal logic formula of the form $\exists f_1,\ldots,f_n$ for which $X(f_1,\ldots,f_n,g_1,\ldots,g_m)$. Here, $f_i$ are internal state functions and $g_i$ are state functions of the interface. The existential quantification over $f_i$ signifies the freedom of implementation. The fact that the $g_i$ are free variables means that they must appear in the implementation (i.e., are in fact implementation level).
The internal states which are implied by a transition axiom specification constrain the implementation a bit; formalisms that do not constrain the implementation (like pure temporal logic) are however not more general than transition axioms. In fact, sometimes it's good to give some hints to an implementation. (However, a specification still should concentrate only on the externally visible behavior. Mechanisms not using additional state variables tend to be very complex.) The approach to write specifications then is to (1) choose a set of states (and thus state functions), (2) specify how they are allowed to change (these are the transition axioms), and (3) specify when they must change. Transition axioms are safety requirements, part 3 specifies liveness requirements. Liveness requirements are written in temporal logic. A specification can be separated into safety and liveness, thus separating the transition axioms from the temporal logic part. Showing that an implementation satisfies a specification, one shows that the system's safety implies the safety specification and then that the system's safety and liveness imply the liveness specification. The system's safety and liveness are given by the implementation, which is a kind of lower level specification. The paper is written in a question/answer style which is very amusing. A rewarding paper.} } @InProceedings{Mattern:1989:VTG, author = "Friedemann Mattern", title = "Virtual time and global states of distributed systems", booktitle = "Proceedings of the International Workshop on Parallel and Distributed Algorithms", editor = "M. Cosnard et al.", publisher = "Elsevier Science Publishers", address = "Chateau de Bonas, France", year = "1989", pages = "215--226", note = "Reprinted on pages 123--133 in \cite{Yang:1994:GST}.", annote = "Classic on vector time, consistent global states etc."
} @PhdThesis{Michel:1989:KDB, author = "Ruben Michel", title = "Knowledge in distributed {Byzantine} environments", school = "Yale University", year = "1989", annote = "requested from yale tech reports" } @Misc{Mills:1989:MPN, OPTkey = {}, author = {David L. Mills}, title = {Measured performance of the Network Time Protocol in the Internet system}, howpublished = {Internet Request for Comments RFC 1128}, year = {1989}, month = oct, OPTnote = {}, OPTannote = {to read} } @Article{Rabin:1989:EDI, author = {Michael O. Rabin}, title = {Efficient dispersal of information for security, load balancing, and fault tolerance}, journal = j-ACM, year = {1989}, OPTkey = {}, volume = {36}, number = {2}, pages = {335--348}, OPTmonth = apr, OPTnote = {}, annote = {This is maybe a relation between security, fault tolerance and redundancy? Uses a scheme of information sharing to make information secure and available.} } @Book{Rao:1989:ECC, author = {T. R. N. Rao and E. Fujiwara}, title = {Error-control coding for computer systems}, publisher = {Prentice-Hall}, year = {1989}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {My standard cite for error control and detection codes.} } @Article{Venkatesan:1989:RPD, author = {Subbarayan Venkatesan}, title = {Reliable protocols for distributed termination detection}, journal = {IEEE Transactions on Reliability}, year = 1989, OPTkey = {}, volume = 38, number = 1, pages = {103--110}, month = apr, OPTnote = {}, annote = {Venky's Homepage: \url{http://www.utdallas.edu/~venky/} The paper looks at distributed termination detection in asynchronous systems with crash failures. It assumes that with $k$ failures the network stays connected and that channels are FIFO. States that termination detection is at least as hard as consensus and thus impossible in the given context, so it assumes what we today call a perfect failure detector.
The presented protocol is based on a termination detection scheme built for fault free systems. If there can be $k$ failures, the protocol elects $k+1$ leaders which replicate the state information of all other nodes. In case of a failure, the termination detection protocol is aborted and a new round is started. In this round, the leaders simulate the behavior of the crashed nodes. References a fault tolerant snapshot protocol by Shah and Toueg which seems to be only available as a Cornell TR \cite{Shah:1984:DSS}.} } @InProceedings{Weber:1989:FSF, author = "D. G. Weber", title = "Formal Specification of Fault-Tolerance and Its Relation to Computer Security", pages = "273--277", ISBN = "0-8186-1942-2", editor = "Sol Greenspan", booktitle = "Proceedings of the 5th International Workshop on Software Specification and Design", address = "Pittsburgh, PA", month = may, year = "1989", publisher = "IEEE Computer Society Press", annote = "A neat and high level description of how fault-tolerance in its different forms can be specified at the system interface. A system is identified with its set of traces. A fault scenario is a precise description of how the components are doomed to fail (this is nowadays called the fault assumption). MTTF can be calculated by averaging over all fault scenarios. A system $D$ has a fault-tolerant version $FTD$, and let $N$ be a set of fault scenarios where no faults occur (fault-free fault assumption) and $C$ be a set of fault scenarios under which we desire fault tolerance. Proving fault tolerance can be now done in three ways: (1) show that the behavior of $D$ under $N$ is identical to the behavior of $FTD$ under $C$, (2) characterize the behavior of $D$ under $N$ by some specification $S$ and show that $FTD$ under $C$ implements $S$, or (3) show that the behavior of $FTD$ under $N$ is identical to the behavior of $FTD$ under $C$.
The third method is taken as the basis for a definition of fault tolerance: A system is fault tolerant if for all its behaviors under $C$ there is an equivalent behavior under $N$. This definition can be weakened by redefining `equivalent' to mean `acceptably equivalent' regarding some equivalence relation on traces. This can also model graceful degradation (as is done in \cite{Herlihy:1991:SGD}). The author indicates that there are close resemblances to computer security specifications: highly sensitive events are analogous to faults as they should not show up on lower levels (i.e. to unauthorized users). Overall a short and concise paper, one of the earliest using this formal view, although I don't understand the differences between (1), (2) and (3) above. Referenced in \cite{Schepers:1993:TFT} as a similar approach as \cite{Joseph:1987:PRF} (expliziter Fehlermodellierung). Generally, security properties are probably higher order properties \cite{McLean:1994:GTC}. Not cited in \cite{Herlihy:1991:SGD}." } @Article{Ben-Or:1990:FPS, author = "Michael {Ben-Or} and Oded Goldreich and Silvio Micali and Ronald L. Rivest", title = "A Fair Protocol for Signing Contracts", number = "1", journal = "IEEE Transactions on Information Theory", volume = "36", pages = "40--46", year = "1990", month = jan, annote = "The authors present a neat fair exchange protocol which works as follows: two parties $A$ and $B$ exchange in rounds signed statements of the form ``with probability $p$ the agreed-upon contract is valid for me'' ($p$ is different for messages signed by $A$ or by $B$). Both parties start with $p=0$ and independently decide how to increase their $p$. In the effective case, eventually both will receive a statement of the form ``with probability 1 the contract is valid for me''. In the non-effective case, one party (say $A$) can turn to a judge and present to it the last message it received from $B$.
The judge throws a dice and decides with probability $p_B$ whether the contract holds for both or not. If it holds, $B$ must obey the contract too. If it does not hold, the contract is refuted. The judge must be able to recollect every verdict (and thus usually store the value together with the contract [a method is given how this can be circumvented]). Overall, this is a very interesting and well-written paper keeping mathematics small. The protocol can be seen as a very general and clever gradual exchange protocol which can also be applied if the to be exchanged item is not infinitely splittable. It is optimistic since the judge is only needed in failed cases. The paper is also interesting since it reviews some fairness definitions regarding gradual exchange (computational vs. probabilistic) and thus comes close to the formalization of strong fairness of \cite{Gaertner:1999:AFD}. Also, an impossibility result concerning two-party exchange is cited \cite{Even:1980:RAP} which is difficult to get, but relevant for \cite{Pagnia:1999:IFE}." } @Article{Biran:1990:CCD, title = "A Combinatorial Characterization of the Distributed 1-Solvable Tasks", author = "Ofer Biran and Shlomo Moran and Shmuel Zaks", pages = "420--440", journal = "Journal of Algorithms", year = "1990", month = sep, volume = "11", number = "3", annote = "[to read] extends \cite{Fischer:1985:IDC}." } @Article{Buerk:1990:VES, author = "Holger B{\"u}rk and Andreas Pfitzmann", title = "Value Exchange Systems Enabling Security and Unobservability", keywords = "digital money, TTP, payment, pseudonyms, ware, complaint period", journal = "Computers \& Security", year = "1990", annote = "[havent got a copy, annote written by someone else:] two approaches to overcome the problem of simultaneity in value exchange. both based on digital signatures (pseudonyms/one-show credentials) certified by TTP: 1.)
passive TTP: - mutual authentication using pseudonyms X <-> Y - signing of agreement X <-> Y - money X -> Y - ware Y -> X, or complaint X -> TTP, TTP checks agreement, asks Y to deliver again and passes ware to X or identifies Y (-> court) 2.) active (intermediary) TTP: - X,Y,TTP make agreement (to protect from TTP) - money X -> TTP (as money can not be copied this has to be done before ware to protect from faulty TTP) - ware Y -> TTP (after receiving ``money-commit'') - money -> Y, ware -> X (after check of ware) abortion after money-transfer requires signed cancelation by TTP and/or proof of payment. how handling interactive/non-transferable payments. question of quality of service has to be solved outside the system by a court. (good paper, not too formal)", number = "8", pages = "715--721", volume = "9", } @Article{Champine:1990:PAD, author = "George A. Champine and Daniel E. {Geer, Jr.} and William N. Ruh", title = "{Project Athena} as a Distributed Computer System", journal = j-IEEE-COMPUTER, volume = "23", number = "9", pages = "40--51", month = sep, year = "1990", abstract = "Now providing 10,000 students and faculty with a variety of network services, MIT's educational workstation system is designed to grow to 10 times its present size.", annote = "[to read]" } @InProceedings{Chaudhuri:1990:AHC, author = "Soma Chaudhuri", title = "Agreement is harder than consensus: set consensus problems in totally asynchronous systems", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", OPTpages = "311--324", booktitle = "Proceedings of Principles of Distributed Computing 1990", year = "1990", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", annote = "The author investigates the boundary between possibility and impossibility of solutions to problems in asynchronous systems. The problems investigated are $k$-set consensus problems, where the agreed upon set of values has size at most $k$.
It is shown that the $m$-resiliency is in relation to the size $k$ of the set. This is another paper exploring the border to impossibility after the FLP result \cite{Fischer:1985:IDC} in the lines of \cite{Attiya:1987:ACA,Dolev:1987:MSN} and \cite{Dwork:1988:CPP}. A subsequent version appeared in Information and Computation, 105 (1), 1993, pp. 132--158." } @Article{Dwork:1990:KCK, author = "Cynthia Dwork and Yoram Moses", title = "Knowledge and Common Knowledge in a {Byzantine} Environment: Crash Failures", journal = "Information and Computation", year = "1990", volume = "88", number = "2", pages = "156--186", topic = "epistemic-logic;mutual-belief;Byzantine-agreement;", annote = "[to read]" } @InCollection{Emerson:1990:TML, author = {E. Allen Emerson}, title = {Temporal and Modal Logic}, booktitle = {Handbook of Theoretical Computer Science}, OPTcrossref = {}, OPTkey = {}, pages = {997--1072}, publisher = {Elsevier}, year = {1990}, editor = {Jan van Leeuwen}, volume = {B}, OPTnumber = {}, OPTseries = {}, OPTtype = {}, chapter = {16}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {Brilliant introduction into the zoo of temporal logics.} } @InProceedings{Gopal:1990:SFB, author = "A. Gopal and S. Toueg", title = "On the Specification of Fault-Tolerant Broadcast", booktitle = "Proc. Int. Workshop on Future Trends of Distributed Computing Systems", pages = "54--56", publisher = "IEEE Computer Society Press", address = "Cairo, Egypt", year = "1990", annote = "[to read]" } @ARTICLE{Gouda:1990:SU, AUTHOR = "Mohamed G. Gouda and Ted Herman", TITLE = "Stabilizing unison", JOURNAL = j-IPL, VOLUME = 35, YEAR = 1990, PAGES = "171--175", annote = "A short paper in the lines of \cite{Arora:1991:MDC}."
} @InProceedings{Gronning:1990:SDD, title = "Stepwise Development of a Distributed Load Balancing Algorithm", author = "Peter Gr{\o}nning and Thomas Qvist Nielsen and Hans Henrik L{\o}vengreen", booktitle = pro-wdag90, editor = "Jan van Leeuwen and Nicola Santoro", year = "1990", series = ser-LNCS, volume = "486", ISBN = "3-540-54099-7", pages = "151--168", annote = "Abstract problem statement like in \cite{Arora:1995:ECC}. Formal definition of globally $k$-balanced, locally $k$-balanced. Resulting system is only locally balanced by simple local exchanges of one load unit at a time. No global balancing wanted, but a very broad sense of global balance achieved (depending on the diameter of the network). The abstract algorithm is transformed into a message passing environment. Explicit reference to self-stabilization of Dijkstra and hints to papers on stepwise development out of specifications." } @InProceedings{Halpern:1990:CEB, author = "Joseph Halpern and Yoram Moses and Orli Waarts", title = "A Characterization of Eventual {Byzantine} Agreement", pages = "333--346", ISBN = "0-89791-404-X", editor = "Cynthia Dwork", booktitle = "Proceedings of the 9th Annual {ACM} Symposium on Principles of Distributed Computing", address = "Qu{\'e}bec City, Qu{\'e}bec, Canada", month = aug, year = "1990", publisher = "ACM Press", annote = "The authors show that while common knowledge is sufficient to achieve simultaneous Byzantine agreement, eventual Byzantine agreement (EBA) is equivalent to achieving continual common knowledge, a variant of common knowledge. They give a brief introduction into the knowledge formalism, define and characterize continual common knowledge and show how to construct optimal EBA protocols with a certain technique. The fault class under consideration comprises omission and crash faults. The conclusions state that results can be extended to Byzantine faults, asynchronous systems and general coordination problems. 
Overall a very concise and brief-up-to-the-bare-minimum paper." } @Article{Halpern:1990:KCK, author = "Joseph Y. Halpern and Yoram Moses", title = "Knowledge and common knowledge in a distributed environment", OPTcrossref = "", OPTkey = "", journal = j-ACM, year = "1990", volume = "37", number = "3", pages = "549--587", month = jul, OPTnote = "", annote = "A brilliant paper on the role of ``knowledge'' in distributed systems. The authors define different notions of knowledge (as opposed to belief) and emphasize the small differences by amusing and instructive examples. The different notions of knowledge are: distributed knowledge of x of a group G (someone who knows everything that people in G know knows x), ``someone'' knowledge, ``everyone'' knowledge, and common knowledge. These form a strict hierarchy. Different forms of knowledge can be ascribed to processors, the most common being view-based knowledge. View-based knowledge of processor p of a fact f means that f is true in all points that are indistinguishable by p. Normally, view-based knowledge bases on the state (or state history) of a node. Common knowledge is the strongest form and the authors show that it is at the core to a lot of important problems in distributed systems (for example agreement). They use the coordinated attack problem (or two way handshake) to show that common knowledge is not attainable in systems with unreliable (or completely asynchronous) message passing and without a global clock. In general, such communication cannot be used to attain common knowledge. This is a direct connection to the impossibility of consensus in asynchronous systems \cite{Fischer:1985:IDC}. In practice, many problems can only be solved because they do not require common knowledge. But also: There are certain weaker kinds of common knowledge that are attainable. 
The first is epsilon-common knowledge, where the fact of sending a message (and that all others receive it) will become common knowledge within epsilon time steps. (This is analogous to a synchronous broadcast.) The second is eventual common knowledge, where sending a message will eventually become common knowledge. (This corresponds to asynchronous but reliable communication.) Eventual common knowledge is weaker than epsilon common knowledge. Things that can not be attained using reliable communication cannot be attained either if communication is unreliable. Finally, the notion of timestamped common knowledge is discussed (``at time t on p's clock p knows something''). Timestamped common knowledge is apparent in many protocols that operate in rounds. At the end, the notion of virtual synchrony is connected to the notion of knowledge consistency, where nodes may actually not have common knowledge, but nothing they see violates this assumption. The conclusions contain hints to other research in the field. Overall, this is a paper with a huge potential that seemingly hasn't been followed in recent years." } @Article{Jalote:1990:FRW, author = "P. Jalote and S. K. Tripathi", title = "Final Report on Workshop on Integrated Approach for Fault Tolerance - Current State and Future Requirements", journal = "ACM Operating Systems Review", volume = "24", number = "1", pages = "40--57", year = "1990", annote = "[to read]" } @InProceedings{Jayanti:1990:WUR, title = "Wakeup under Read/Write Atomicity", author = "Prasad Jayanti and Sam Toueg", booktitle = "Distributed Algorithms, 4th International Workshop", editor = "Jan van Leeuwen and Nicola Santoro", address = "Bari, Italy", month = "24--26~" # sep, year = "1990", series = ser-LNCS, volume = "486", publisher = pub-SV, ISBN = "3-540-54099-7", pages = "277--288", annote = "Ted says that here's a possible relation between self-stabilization and unreliable failure detection. Have to get it." 
} @Book{Krumm:1990:FAK, author = {Heiko Krumm}, title = {{Funktionale Analyse von Kommunikationsprotokollen}}, publisher = pub-SV, year = {1990}, OPTkey = {}, OPTvolume = {}, number = {247}, series = {Informatik-Fachberichte}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {Krumm entwirft ein allgemeines Modell zur Beschreibung funktionaler Aspekte von Kommunikationsprotokollen kooperierender Systeme und gibt einen verdienstvollen Ueberblick ueber die existierenden Spezifikationstechniken und deren Zusammenhaenge untereinander. Die Grundbegriffe des Modells sind System, Kopplung und Instanz. Ein System besteht aus einer Menge von Instanzen, die intern ueber eine Kopplung kommunizieren. Das System selbst hat im Falle eines offenen Systems eine Schnittstelle zu einer Systemumwelt, und da Instanzen selbst wieder Systeme im Kleinen sind, haben Instanzen auch eine Instanz-Schnittstelle. Diese statische Grundstruktur erlaubt eine hierarchische Systemdefinition. Systeme bzw. Instanzen koennen intern betrachtet werden (d.h. ihr innerer Aufbau inklusive Subsystemen und Kopplung) oder extern (d.h. nur anhand ihres Verhaltens an der Schnittstelle). Eine Schnittstelle ist eine Menge von Ereignissen. Das Kommunikationsverhalten an einer Schnittstelle ist ein Baum, dessen Kanten mit derartigen Ereignissen bezeichnet ist. Kommunikationsverhalten abstrahiert von internem Verhalten eines Systems. Durch Betrachtung des Kommunikationsverhaltens ist es moeglich, Instanzen bezueglich ihres Verhaltens zu vergleichen. Kommunikation wird modelliert durch eine Kopplung, die selbst ein System ohne interne Zustaende ist und atomar beliebig viele Ereignisse (=Nachrichten) an der Schnittstelle empfangen und Versenden kann. Kopplungen arbeiten nach dem Uebereinkunftsprinzip (synchron) oder nach dem Uebertragungsprinzip (asynchron). Systeme selbst werden auf der Basis von Zustaenden und Zustandsuebergaengen definiert. 
Aus dieser Definition entspricht der Menge aller Systemablaeufe ein Erreichbarkeitsgraph, der eine endliche Repraesentation eines prinzipiell unendlichen Verhaltens ist. Das Schnittstellenverhalten (Kommunikationsverhalten) ist ein Baum, der mit dem internen Systemablauf vertraeglich ist. Anschliessend wird auf die Begriffe Dienst und Protokoll eingegangen. Ein Protokoll ist ein internes Ablaufverhalten eines Systems, welches vom globalen Verhalten abstrahiert und nur die vom Protokoll reglementierten Kommunikationsbeziehungen betrachtet. Ein Dienst ist eine Instanz, die an ihrer Schnittstelle ein gewisses Verhalten (mit gewissen Ereignissen) garantiert/anbietet. Protokolle sind darum horizontale Kommunikationsbeziehungen waehrend Dienste vertikale Beziehungen darstellen (bezogen auf die gebraeuchliche Darstellung des ISO/OSI-Protokollstacks). Im folgenden Kapitel werden die gaengigen Analysemassnahmen angesprochen (von informalen Ueberpruefungen bis formalen Korrektheitsnachweisen) und der Begriff der Eigenschaft eines Protokolls definiert. Anschliessend werden gaengige Spezifikationstechniken klassifiziert nach Spezifikationsform (konstruktiv = spezifiziere Schnittstellenverhalten durch internes Verhalten, deskriptiv = spezifiziere Schnittstellenverhalten durch direkte Verhaltensbeschreibung an der Schnittstelle). Konstruktive Techniken koennen direkt (konkreter Automat angegeben) oder algebraisch (eine gewisse Abstraktion von internem Automatenverhalten) sein. Deskriptive Techniken koennen logisch (aufbauend auf einem (temporal-)logischen Kalkuel) oder auf Zusicherungen aufbauen. Letztere koennen allerdings nur Sicherheitseigenschaften verifizieren. Beispiele fuer die einzelnen Spezifikationsformen werden gegeben (Petri Netze, Milners CCS, erweiterte endliche Automaten). 
Das Buch ist insgesamt sehr gut lesbar und auch fuer Einsteiger in das Gebiet durchaus geeignet, vor allem, weil es auf Deutsch ist.} } @INCOLLECTION{Lamport:1990:DCM, AUTHOR = "Leslie Lamport and Nancy Lynch", TITLE = "Distributed computing: models and methods", BOOKTITLE = "Handbook of Theoretical Computer Science (Volume B: Formal Models and Semantics)", EDITOR = "Jan van Leeuwen", PUBLISHER = "Elsevier", YEAR = 1990, CHAPTER = 18, PAGES = "1157--1199", annote = "" } @Book{Lee:1990:FTP, author = "Peter A. Lee and Thomas Anderson", title = "Fault Tolerance: Principles and Practice", series = "Dependable computing and fault-tolerant systems", publisher = pub-SV, address = "Berlin ; New York", year = "1990", edition = "Second", annote = "[to read]", } @Article{Leveson:1990:USC, author = "Nancy G. Leveson and Stephen S. Cha and John C. Knight and Timothy J. Shimeall", title = "The Use of Self Checks and Voting in Software Error Detection: An Empirical Study", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "16", number = "4", pages = "432--443", year = "1990", abstract = "This paper presents the results of an empirical study of software error detection using self checks and N-version voting. A total of 24 graduate students in computer science at the University of Virginia and the University of California, Irvine, were hired as programmers. Working independently, each first prepared a set of self checks to an existing implementation of that specification. The modified programs were executed to measure the error-detection performance of the checks and to compare this with error detection using simple voting among multiple versions. The goal of this study was to learn more about the effectiveness of such checks. The analysis of the checks revealed that there are great differences in the ability of individual programmers to design effective checks. 
We found that some checks which might have been effective failed to detect an error because they were badly placed, and there were numerous instances of checks signaling nonexistent errors. In general, specification-based checks alone were not as effective as combining them with code-based checks. using self checks, faults were identified that had not been detected previously by voting 28 versions of the program over a million randomly-generated inputs. This appeared to result from the fact that the self checks could examine the internal state of the executing program whereas voting examines only the final results of computations. If internal states had to be consistent in N-version voting systems, then there would be no reason to write multiple versions. The programs were executed on 100 000 new randomly-generated input cases in order to compare error detection by self checks and by 2-version and 3-version voting. Both self checks and voting techniques led to the identification of the same number of faults for this input, although the identified faults were not the same. Furthermore, whereas the self checks were always effective at detecting an error caused by a particular fault (if they ever did), N-version voting triples and pairs were only partially effective at detecting the failures caused by particular faults. Finally, checking the internal state with self checks also resulted in finding faults that did not cause failures for the particular input case executed. This has important implications for the use of back-to-back testing.", note = "29 refs", } @Article{Mullender:1990:ADO, author = "Sape J. Mullender and Guido {van Rossum} and Andrew S. 
Tanenbaum and Robbert {van Renesse} and Hans {van Staveren}", title = "{Amoeba}: {A} Distributed Operating System for the 1990s", journal = j-IEEE-COMPUTER, volume = "23", number = "5", pages = "44--53", month = may, year = "1990", abstract = "Amoeba is the distributed system developed at the Free University (VU) and the Centre for Mathematics and Computer Science (CWI), both in Amsterdam. Throughout the project's ten-year history, a major concern of the designers has been to combine the research themes of distributed systems, such as high availability, use of parallelism and scalability, with simplicity and high performance. Distributed systems are necessarily more complicated than centralized systems, so they have a tendency to be much slower. Amoeba was always designed to be used, so it was deemed essential to achieve extremely high performance. The Amoeba software is based on objects. An objects is a piece of data on which well-defined operations may be performed by authorized users, independent of where the user and object are located. Objects are managed by server processes and named using capabilities chosen randomly from a sparse name space. Processes consist of a segmented address space shared by one or more threads of control. Processes can be created, managed, and debugged remotely. Operations on objects are implemented using remote procedure calls. Amoeba has a unique and fast file system. The file system is split into two parts --- the Bullet Service, which stores immutable files contiguously on the disk and the SOAP Directory Service, which provides a mechanism for giving capabilities symbolic names. 
The directory server also handles replication and atomicity, eliminating the need for a separate transaction management system.", annote = "[to read]" } @Article{Neiger:1990:AIF, author = "Gil Neiger and Sam Toueg", title = "Automatically Increasing the Fault-Tolerance of Distributed Algorithms", journal = "Journal of Algorithms", year = "1990", volume = "11", number = "3", pages = "374--419", annote = "Say you have designed a distributed algorithm in a synchronous (round based) system that tolerates crash failures using reliable communication. Can you mechanically derive a protocol which does the same thing and also tolerates send-omission, general omission or arbitrary failures? Yes you can, and Neiger and Toueg show you how to do it. The authors define a so-called translation, i.e. a function $T$ that converts a protocol $P_b$ to a protocol $P_s$. $P_b$ is correct when running in a system subject to a ``benign'' failure model $b$, and $P_s=T(P_b)$ is supposed to be correct in a system subject to a more severe failure model $s$. Correctness means that $P_s$ has the same set of histories as $P_b$ when you inspect only that part of the state which also exists in $P_b$. Also, only the states after a fixed numer $c$ are inspected (i.e. they speak of a $c$-phase simulation). Formally, a translation from a system $S_b$ to a system $S_s$ is given by a history simulation function $H$ with the following properties: (a) $H$ maps histories of a protocol running in $S_s$ to histories in $S_b$ and these histories are valid. (b) $H$ preserves the correctness of processors, i.e. a processor correct in $S_s$ is also correct in $S_b$ (not necessarily vice versa), (c) the states from a history in $S_b$ appear in steps of $c$ in the history of $S_s$. This refers to the states of all processors (this must be weakened when investigating translations to the byzantine failure model; there they only refer to the state of the correct processors). 
A translated protocol solves some problem if its translated histories solve the original problem specification. The authors continue to present translations from crash to send-omission and then from crash to general-omission. The idea is to insert additional rounds of communication and let processors which do a general omission crash themselves. Because the number of faulty processors in both systems is $t$, such a translation is possible. When dealing with arbitrary failures, the properties of the translation function are weakened (see above). Translations are presented which use a validated reliable broadcast primitive to be able to detect byzantine behavior and pretend that the bad processes crashed. Some lower bounds are proved as well. Overall a well-readable paper despite the formalisms and the proofs. It is interesting how the original correctness specifications are transformed into systems with a more severe failure model: with crash the specification stays the same (since we are in a synchronous environment this is possible \cite{Gaertner:1999:ESD}), with Byzantine we restrict the correctness to the set of correct processes. Are there intermediate steps?" } @Article{Nelson:1990:FTC, author = "Victor P. Nelson", title = "Fault-tolerant computing: fundamental concepts", OPTcrossref = "", OPTkey = "", journal = j-IEEE-COMPUTER, year = "1990", volume = "23", number = "7", pages = "19--25", month = jul, OPTnote = "", annote = "The author first defines the usual terms (fault, error, failure, fault classes, availability, dependability, reliability) and then reviews common elements of strategies in fault tolerance with focus on hardware. The elements are fault masking, fault detection, fault containment, diagnosis, repair/reconfiguration, recovery. He elaborates on error detection/masking/correction (using codes), self-checking logic, module replication, timing checks, fault containment, reconfiguration, repair and recovery. 
He only handles masking fault tolerance (indicating that safety is more important than liveness \cite{Kreitz:1998:SWL}). An insightful paper where the ideas come from the obvious strive to organize the material more strictly. This is a task Nelson initiates, but has seemingly not aimed at." } @Article{Ramanathan:1990:FCS, author = "Parameswaran Ramanathan and Kang G. Shin and Ricky W. Butler", title = "Fault-Tolerant Clock Synchronization in Distributed Systems", journal = j-IEEE-COMPUTER, volume = "23", number = "10", pages = "33--42", month = oct, year = "1990", abstract = "Software algorithms are suitable only where loose synchronization is acceptable, and hardware algorithms are expensive. A hybrid scheme achieves reasonably tight synchronization and is cost-effective.", keywords = "Computer Software--Applications; Computer Systems, Digital; Computers, Digital--Synchronization; Consistency algorithms; Convergence-averaging; Convergence-nonaveraging; Distributed; Distributed systems; Fault tolerant clock synchronisation; Fault-Tolerant Systems; Hardware synchronization algorithms; Hybrid synchronization; Probabilistic synchronization; Software synchronization algorithms; Synchronization Algorithms; Worst-case clock skews", annote = "[to read]" } @Book{Raynal:1990:SCD, author = "Michel Raynal and Jean-Michel Helary", title = "Synchronization and Control of Distributed Systems and Programs", series = "Wiley Series in Parallel Computing", pages = "124", publisher = "John Wiley \& Sons", address = "New York", year = "1990", keywords = "book, text,", abstract = "** Description ** The mastery of distributed applications demands a complete understanding of the foundations of the distributed algorithm. The object of this book is to present these foundations as they relate to synchronization--the key element of parallelism and distribution. 
Divided into four chapters, it explores the different types of synchronization that may be encountered in a parallel application and presents the concept of wave and several of its possible implementations. Synchronous and asynchronous systems and their relationships are described, as well as the concept of the synchronization phase, its properties, and its use.\par ** Contents ** Different Forms of Synchronization between Processes. The Concept of a Wave, and Synchronization by Wave Sequence. Synchronization by Logic Pulsing. Synchronization by Phases. Appendices. References. Index.\par ** Market ** Engineers, Researchers, Professors and Students of Engineering.", note = "F-0-471-92453-9 1990cloth \$84.95", } @Article{Schneider:1990:IFS, author = "Fred B. Schneider", title = "Implementing fault-tolerant services using the state machine approach: {A} tutorial", OPTcrossref = "", OPTkey = "", journal = j-ACM-COMP-SURVEYS, year = 1990, volume = 22, number = 4, pages = "299--319", month = dec, OPTnote = "", annote = "The state machine approach enhances the fault-tolerance properties of a system by replicating nodes and coordinating the actions of these nodes (and the communication to and from them). The replica group thus acts as a single state machine, but now a certain number and kind of faults can be tolerated. This paper presents this approach and also discusses reconfiguration techniques. This is the paper to cite for the term ``state machine approach''." } @Article{VanGasteren:1990:ANI, author = {A. J. M. van Gasteren and Gerard Tel}, title = {Comments on ``on the proof of a distributed algorithm'': always true is not invariant}, journal = j-IPL, year = 1990, OPTkey = {}, volume = 35, OPTnumber = {}, month = sep, pages = "277--279", OPTnote = {}, annote = "a paper which explains the intrinsic difference between the notions of ``invariant'' and ``always-true''. 
A predicate $P$ is an invariant (1) if $P$ holds in every initial state of a system, and (2) $P$ is not falsified by any action of the system. A predicate $P$ is always true if $P$ holds in every reachable state of the system. This means, an invariant is always true, but the converse is not valid. Example: a program which has one variable $k$ (initially 0) and one action ``if $k=1$ then $k:=2$''. Consider the predicate $P\equiv k<2$. Then $P$ is always true for the program, but $P$ is not an invariant, because the action does not maintain $P$. The authors argue that invariance is more useful because it is maintained by program composition." } @Article{Abadi:1991:ERM, author = {Mart{\'\i}n Abadi and Leslie Lamport}, title = {The Existence of Refinement Mappings}, journal = {Theoretical Computer Science}, year = {1991}, OPTkey = {}, volume = {82}, number = {2}, pages = {253--284}, month = may, OPTnote = {}, url = "http://www.research.digital.com/SRC/personal/Martin_Abadi/Papers/tcs.ps", annote = {Programs and specifications are viewed as formulas of the same logic (originally an idea of \cite{Pnueli:1981:TSC} explained in \cite{Abadi:1995:CS,Abadi:1993:CS,Lamport:1989:SAS}). The semantics of such a formula is the set of executions $\phi$ produced by that formula. A program $p_1$ implements $p_2$ if $\phi(p_1)$ implies $\phi(p_2)$. That $p_1$ implements $p_2$ can be shown by exhibiting a refinement mapping which relates the actions of $p_1$ to those of $p_2$. However, the validity of the implication does not guarantee that such a mapping exists. A refinement mapping between state spaces $S_1$ and $S_2$ can be used to prove that a state machine $\Sigma_1$ using states from $S_1$ implements a state machine $\Sigma_2$ using states from $S_2$. The main result of this paper is the following theorem: If $\Sigma_1$ implements $\Sigma_2$ then one can add history and prophecy variables to the specification of $\Sigma_1$ to find a refinement mapping from $S_1$ to $S_2$. 
The assumptions to prove this theorem are: (1) $S_1$ is machine closed, i.e. the ``liveness'' property of $\Sigma_1$ does not imply additional safety properties. (2) $\Sigma_2$ has finite invisible nondeterminism, i.e. external steps of $\Sigma_2$ must be finitely representable internally, and (3) $\Sigma_2$ is internally continuous, i.e. a not-allowed behavior can be determined by looking at the externally visible behavior plus only a finite part of the internal behavior. Other proved propositions are: any safety property has a specification with finite invisible nondeterminism, any safety property is internally continuous, and any property has a machine closed specification. The result shows that it is always possible to prove safety using refinement mappings, if not liveness.} } @Article{Arora:1991:MDC, author = "A. Arora and S. Dolev and M. Gouda", title = "Maintaining digital clocks in step", journal = "Parallel Processing Letters", volume = "1", number = "1", pages = "11--18", month = sep, year = "1991", keywords = "clocks; N-clock; simultaneously triggered clocks; stabilisation; stability; system", annote = "The authors present a design for achieving exact synchronization of bounded digital clocks in synchronous (i.e., lock-step) systems like digital circuits. The approach is an early example of applying the closure and convergence paradigm to problems, resulting in two self-stabilizing solutions: (1) a fall back solution, where a node periodically checks the clocks of its neighbours and falls back to a minimum value if values differ; and (2) a catch up solution where a maximum value is taken. The protocols are simple, uniform and distributed. The stabilization time is in the order of the degree of the nodes times the diameter of the network. Overall, this is a paper unmistaken in the clarity and enjoyment of exposition and style, gladly to be read." 
} @INPROCEEDINGS{Awerbuch:1991:SLC, AUTHOR = "Baruch Awerbuch and Boaz Patt-Shamir and George Varghese", TITLE = "Self-stabilization by local checking and correction", BOOKTITLE = "FOCS91 Proceedings of the 32nd Annual IEEE Symposium on Foundations of Computer Science", YEAR = 1991, PAGES = "268--277", annote = "[to write]" } @InProceedings{Chandra:1991:UFD, author = "Tushar Deepak Chandra and Sam Toueg", title = "Unreliable failure detectors for asynchronous systems", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "325--340", booktitle = pro-podc91, year = "1991", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", annote = "The authors introduce the concept of a failure detector to battle the impossibility of consensus in asynchronous systems. Failure detectors allow to make guesses on which computers are still alive and which are not in the network. They are classified and applied to consensus and atomic broadcast. See also the journal version of this paper \cite{Chandra:1996:UFD}. First reference to the concept." } @ARTICLE{Chen:1991:SAC, AUTHOR = "Nian-Shing Chen and Hwey-Pyng Yu and Shing-Tsaan Huang", TITLE = "A self-stabilizing algorithm for constructing spanning trees", JOURNAL = j-IPL, VOLUME = 39, YEAR = 1991, PAGES = "147--151" } @Article{Cooper:1991:CDG, author = "Robert Cooper and Keith Marzullo", title = "Consistent detection of global predicates", OPTcrossref = "", OPTkey = "", journal = j-SIGPLAN, year = "1991", volume = "26", number = "12", pages = "167--174", month = dec, OPTnote = "", annote = "Citable definition of possibly(P) and definitely(P)." 
} @Article{Cristian:1991:UFD, author = {Flaviu Cristian}, title = {Understanding fault-tolerant distributed systems}, journal = j-CACM, year = 1991, volume = 34, number = 2, month = feb, pages = "56--78", annote = "Describes traditional approach to fault-tolerant computing: failure models, failure semantics, fault-tolerance, architectural issues, standard systems, masking failures, hardware and software fault-tolerance." } @Book{Echtle:1990:F, author = {Klaus Echtle}, title = {Fehlertoleranzverfahren}, publisher = pub-SV, year = 1990, annote = "Echtle's well-known book on fault tolerance strategies. Ist in der Bib. Inf. vorhanden" } @InProceedings{Arora:1991:MDS, title = "Maintaining Digital Clocks In Step", author = "Anish Arora and Shlomi Dolev and Mohamed G. Gouda", booktitle = "Distributed Algorithms, 5th International Workshop", editor = "Sam Toueg and Paul G. Spirakis and Lefteris M. Kirousis", address = "Delphi, Greece", month = "7--9~" # oct, year = "1991", series = ser-LNCS, volume = "579", publisher = pub-SV, ISBN = "3-540-55236-7", pages = "71--79", annote = "[to get]" } @InProceedings{Flatebo:1991:SLB, author = "Mitchell Flatebo and Ajoy Kumar Datta", title = "Self-stabilizing load balancing for an arbitrary network", OPTcrossref = "", OPTkey = "", editor = "J. Wu and W. Gao and J. Yang and Y. Li", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "743--746", booktitle = "ICYCS-93: Young Computer Scientists. Proceedings of the Third International Conference", year = "1993", OPTorganization = "", publisher = "Tsinghua University Press", address = "Beijing, China", month = jul, OPTnote = "", annote = "[who can get a hand on this?]" } @ARTICLE{Gouda:1991:AP, AUTHOR = "Mohamed G. Gouda and Ted Herman", TITLE = "Adaptive programming", JOURNAL = j-IEEE-TRANS-SOFTW-ENG, VOLUME = 17, NUMBER = 9, MONTH = sep, YEAR = 1991, PAGES = "911--921", annote = "Adaptive programs change their behaviour according to changes in their environment. 
Environment changes are assumed to be gradual and occur within relatively short periods of time compared to long periods of non-change. During periods of change an adaptive program behaves arbitrarily and eventually reaches a consistent behaviour if changes cease. The authors define adaptivity in terms of a `secures' relation: P secures Q in S means that if the environment establishes an input predicate P, then the program S will eventually reach a state where Q holds. A program is adaptive if all properties of interest can be expressed using the secures relation. Thus, adaptivity is a general form of self-stabilization (which is ``true secures Q in S''). But in self-stabilization, legal states are usually defined in terms of internal variables. In adaptive programs there can be changes of the definition of legal states imposed by the environment." } @ARTICLE{Gouda:1991:SCP, AUTHOR = "Mohamed G. Gouda and Nicholas J. Multari", TITLE = "Stabilizing communication protocols", JOURNAL = j-IEEE-TRANS-COMP, VOLUME = 40, NUMBER = 4, MONTH = apr, YEAR = 1991, PAGES = "448--458", annote = "convergence stair presented" } @Article{Herlihy:1991:SGD, author = {Maurice P. Herlihy and Jeannette M. Wing}, title = {Specifying graceful degradation}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1991}, OPTkey = {}, volume = {2}, number = {1}, pages = {93--104}, month = jan, OPTnote = {}, annote = {The authors show how the ideal specification of a program can be ``degraded'' in a structured way so that the behavior of the program is still ``close'' to the ideal specification if the environment (faults etc.) prohibits the ideal specification to be satisfied. Processes and the environment are modeled as finite state machines. State transitions of the processes are called operations, transitions of the environment are called events. The combined automaton produces executions (sequences of state/operation pairs). 
The ideal specification prescribes a certain set of executions assuming a certain state of the environment. The environment ensures some properties called constraints. Events cause these constraints to be violated, resulting in an `enlarged' behavior of the combined automaton. Depending on the set of constraints guaranteed by the environment, the combined automaton satisfies a weaker specification than the ideal specification. The constraints induce a lattice on the set of specifications of the automaton. This allows a designer to specify system behavior in the presence of violated constraints. (Cases where this can arise in practice are faults, timing violations or security breaches.) The method (called the lattice relaxation method) makes environmental assumptions explicit and enables you to specify unwanted but sometimes unavoidable cases of system performance. Let's see how specifications can be systematically parametrized to yield such a lattice. [This paragraph was written in a state of partial sleep deliria; do not infer the quality of the paper from this text. In fact, the paper is very deep and interesting.] A similar idea is mentioned in \cite{Schepers:1993:TFT}. 
Does not cite \cite{Weber:1989:FSF}.}
}

@PhdThesis{Herman:1991:ATD,
  author =       {Ted Herman},
  title =        {Adaptivity through distributed convergence},
  school =       {Department of Computer Science, University of Texas at Austin},
  year =         {1991},
  OPTkey =       {},
  OPTaddress =   {},
  OPTtype =      {},
  OPTmonth =     {},
  OPTnote =      {},
  OPTannote =    {nicht kopiert/ausgedruckt}
}

@InProceedings{Liskov:1991:PUS,
  author =       "Barbara Liskov",
  title =        "Practical Uses of Synchronized Clocks in Distributed Systems",
  pages =        "1--10",
  ISBN =         "0-89791-439-2",
  editor =       "Luigi Logrippo",
  booktitle =    pro-podc91,
  address =      "Montr{\'e}al, Qu{\'e}bec, Canada",
  month =        aug,
  year =         "1991",
  publisher =    pub-ACM,
  annote =       "Discusses several uses of synchronized clocks in
                 distributed algorithms: SCMP protocol (a protocol that
                 achieves at-most-once semantics of messages), tickets in
                 Kerberos, and several forms of leases
                 \cite{Gray:1989:LEF} for maintaining replica
                 consistency. The starting point to remember is that
                 assumptions about clock rates are always probabilistic
                 and so assumptions about synchronization should only
                 affect the performance not the correctness of a
                 protocol. In general, time is used to achieve liveness,
                 e.g. a server requests a replica to give up its lease;
                 it waits either until the replica replies or its lease
                 expires. The final paragraph contains some ideas on how
                 to transform an algorithm not relying on synchronized
                 clocks into more efficient versions using synchronized
                 clocks: (1) identify messages which could be avoided
                 using timestamps, (2) if message exchange is already
                 reduced, find ways to save storage using timestamps
                 (e.g. purge storage after $t$ seconds)."
} @PhdThesis{Liu:1991:FTP, author = {Zhiming Liu}, title = {Fault-tolerant programming by transformations}, school = {University of Warwick, Department of Computer Science}, year = {1991}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, OPTmonth = {}, OPTnote = {}, annote = {Published and extended in many forms \cite{Liu:1992:TPF,Liu:1993:SVR,Liu:1994:SDF,Liu:1995:FFF} and \cite{Liu:1996:VFR,Liu:1998:SVF} but seemingly the only reference to the term ``finite error behavior'' (p. 27).} } @InProceedings{Long:1991:SRI, author = {Darrel D. E. Long and John L. Carroll and C. J. Park}, title = {A study of the reliability of {Internet} sites}, booktitle = pro-srds91, OPTcrossref = {}, OPTkey = {}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, year = {1991}, OPTorganization = {}, OPTpublisher = {}, OPTaddress = {}, month = sep, pages = {177--186}, OPTnote = {}, annote = {to read} } @Book{Manna:1991:TLR, author = {Zohar Manna and Amir Pnueli}, title = {The temporal logic of reactive and concurrent systems: Specification}, publisher = pub-SV, year = {1991}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {See also \cite{Manna:1995:TVR}.} } @InProceedings{Marzullo:1991:DGS, author = "Keith Marzullo and Gil Neiger", title = "Detection of global state predicates", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "254--272", booktitle = pro-wdag91, year = "1991", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", OPTannote = "" } @TechReport{Marzullo:1991:TCD, author = "Keith Marzullo and Mark D. 
Wood", title = "Tools for Constructing Distributed Reactive Systems", institution = "Dept.\ of Computer Science, Cornell University", year = "1991", number = "TR 91-1193", address = "Ithaca, New York ({USA})", month = feb, annote = "mentions sensor/actuator approach [to read]" } @Article{Oezveren:1991:SSD, title = "Stability and Stabilizability of Discrete Event Dynamic Systems", author = "C{\"u}neyt M. {\"O}zveren and Alan S. Willsky and Panos J. Antsaklis", area = "Theory of Computation", pages = "730--752", journal = j-ACM, month = jul, year = "1991", volume = "38", number = "3", general-terms = "Algorithms, Design, Languages, Reliability, Theory", keywords = "Reliability, self-stabilizing systems, stability, stabilizability, state feedback", cr-categories = "F.2.2 [computations on discrete structures \and sequencing and scheduling]; F.4.3 [algebraic language theory \and classes defined by grammars or automata]; G.2.2 [graph algorithms]; G.4 [algorithm analysis \and reliability and robustness]; H.2.8; J.7 [command and control \and process control]", annote = "[to read]" } @Article{Peleska:1991:DVF, author = "Jan Peleska", title = "Design and Verification of Fault Tolerant Systems with {CSP}", pages = "95--106", journal = j-DC, volume = "5", number = "2", year = "1991", publisher = pub-SV, annote = "A case study in proving a hot standby system correct using CSP. The proof method is like in CSP and proves refinements down several levels to the implementation. At some lower level, crash faults are introduced and masked by a redundant component together with a reconfiguration procedure. It seems as if faults and fault actions are modeled explicitly. Conversely to \cite{Peled:1994:CFF}, refinement steps are constructed by hand instead of using correctness preserving transformations (this is advocated as ``invent and verify'' which is claimed to suit industry). You need to know good CSP to really understand the text. 
If only parts of the system properties may be proved, this is noted to be something like graceful degradation."
}

@Article{Ralston:1991:FMH,
  author =       {T. J. Ralston and S. L. Gerhart},
  title =        {Formal methods: {History}, practice, trends and prognosis},
  journal =      {American Programmer},
  year =         {1991},
  OPTkey =       {},
  OPTvolume =    {},
  OPTnumber =    {},
  OPTpages =     {},
  month =        may,
  OPTnote =      {},
  annote =       {[to get], cited in \cite{Glass:1999:RST} as the only
                 study which has produced hard numbers on the benefit of
                 applying formal methods in software engineering.}
}

@InProceedings{Sanders:1991:PTA,
  author =       "Beverly Sanders",
  title =        "A Predicate Transformer Approach to Knowledge and Knowledge-based Protocols",
  pages =        "217--22",
  ISBN =         "0-89791-439-2",
  editor =       "Luigi Logrippo",
  booktitle =    pro-podc91,
  address =      "Montr{\'e}al, Qu{\'e}bec, Canada",
  month =        aug,
  year =         "1991",
  publisher =    "ACM Press",
  annote =       "[to read]"
}

@Article{Swade:1991:CCB,
  author =       "D. Swade",
  title =        "The construction of {Charles Babbage's} difference engine",
  journal =      "Annals of the History of Computing",
  volume =       "13",
  number =       "1",
  pages =        "82--83",
  year =         "1991",
  keywords =     "Babbage, difference engine",
  abstract =     "Science Museum UK is building Babbage's difference
                 engine (not his analytic engine which \~{} computer) to
                 celebrate 200-th anniversary of Babbage's death (1771).
                 4000 parts, 3 tons, 10x6x1.5 feet Being built in
                 materials and with accuracy of Babbage's day. The D.E.
                 calculates 7th order polynomials to 30 decimal places.",
  annote =       "Bowen \cite{Bowen:1993:SCS} cites another text by Swade
                 towards the concerns of Charles Babbage about the `table
                 crisis' which led to the development of the difference
                 engine."
}

@Book{Tel:1991:TDA,
  author =       {Gerard Tel},
  title =        {Topics in Distributed Algorithms},
  publisher =    {Cambridge University Press},
  year =         {1991},
  OPTkey =       {},
  OPTvolume =    {},
  number =       {1},
  series =       {Cambridge International Series on Parallel Computation},
  OPTaddress =   {},
  OPTedition =   {},
  OPTmonth =     {},
  OPTnote =      {},
  OPTannote =    {}
}

@PhdThesis{Arora:1992:FFC,
  author =       {Anish Kumar Arora},
  title =        {A foundation of fault-tolerant computing},
  school =       {The University of Texas at Austin},
  year =         {1992},
  OPTkey =       {},
  OPTaddress =   {},
  OPTtype =      {},
  month =        dec,
  OPTnote =      {},
  annote =       "Arora's thesis defines fault tolerance as the result of
                 two conditions: closure and convergence. Closure means
                 that a system remains in a set of legal states during
                 normal system behaviour, convergence assures that any
                 fault (modelled as actions on an extended state space
                 \cite{Cristian:1985:RAF}) is eventually tolerated by
                 returning into the set of legal states in finite time.
                 This is a stabilizing notion of fault tolerance,
                 published in an IEEE conference proceedings
                 \cite{Arora:1993:CCF} and subsequently enhanced into a
                 theory of correctors and detectors, a general theory of
                 fault tolerance \cite{Arora:1998:CDM}."
}

@Article{Beauquier:1992:TDP,
  author =       "Joffroy Beauquier",
  title =        "Two distributed problems involving {Byzantine} processes",
  journal =      "Theoretical Computer Science",
  volume =       "95",
  number =       "1",
  pages =        "169--185",
  day =          "23",
  month =        mar,
  year =         "1992",
  annote =       "The author investigates two problems where processes are
                 subject to Byzantine behavior: the naming problem (i.e.,
                 assigning unique names to processes) and the mutual
                 exclusion problem. For these problems to be solvable,
                 the author makes the following minimal assumptions: if k
                 is a strict upper bound on the number of Byzantine
                 processes, then the network must be k-connected, meaning
                 that there are at least (k+1) disjoint paths between any
                 two nodes.
Also, communication must be synchronous and the algorithm must be non-uniform, i.e., there exists an exceptional node (the initiator) which is non-Byzantine. Access to public key cryptosystems is assumed. For mutual exclusion, the Byzantine processes may not hold the token arbitrarily long, i.e., their behavior is correct once they are in their critical section and during the exit sequence from it. This leads to the definition of locally Byzantine processes. Difficulties arise, because Byzantine nodes may not generally be detected. They may act normally when communicating to the outside world forever. Naming is achieved through a kind of echo algorithm that achieves safety through backwards confirmation over a given path. Through the k connectivity and communication synchrony it is assured that valid information eventually reaches a correct node and that this node can check the information. The mutual exclusion algorithm bases on Byzantine agreement. " } @InProceedings{Chandra:1992:WFD, author = "Tushar Deepak Chandra and Vassos Hadzilacos and Sam Toueg", title = "The Weakest Failure Detector for Solving Consensus", pages = "147--158", editor = "Maurice Herlihy", booktitle = pro-podc92, address = "Vancouver, BC, Canada", month = aug, year = "1992", publisher = "ACM Press", annote = "Continuing work \cite{Chandra:1991:UFD} shows that a certain form of failure detector is the weakest one necessary to solve consensus. See also journal version of this paper \cite{Chandra:1996:WFD} and other papers on this subject \cite{Chandra:1996:UFD,Chandra:1991:UFD}." } @Article{Hariri:1992:ASD, author = "Salim Hariri and Alok Choudhary and Behcet Sarikaya", title = "Architectural Support for Designing Fault-Tolerant Open Distributed Systems", journal = j-IEEE-COMPUTER, volume = "25", number = "6", pages = "50--62", month = jun, year = "1992", annote = "[to read]" } @TechReport{Heimerdinger:1992:CFS, author = {Walter L. Heimerdinger and Chuck B. 
Weinstock},
  title =        {A conceptual framework for system fault tolerance},
  institution =  {Software Engineering Institute},
  year =         {1992},
  OPTkey =       {},
  OPTtype =      {},
  number =       {CMU/SEI-92-TR-33},
  address =      {Carnegie Mellon University, Pittsburgh, PA},
  month =        oct,
  OPTnote =      {},
  annote =       {A good introductory text to the traditional concepts
                 and issues in fault-tolerant computing (in the lines of
                 \cite{Laprie:1992:DBC}) targeted at engineers and
                 practitioners. Defines a system, dependability
                 specifications (repeats the $10^{-9}$ reliability rate
                 of commercial aircraft, states the problems with
                 implicit and explicit specifications commented on by
                 David Powell in Madeira), failure modes, faults vs.
                 failures (fault is the failure of a subcomponent, avoids
                 the term error), defines failure regions as opposed to
                 fault regions (vertical vs. horizontal perspective).
                 Enumerates fault tolerance mechanisms (mainly redundancy
                 management) and gives informal definitions of time and
                 space redundancy, which are said to be necessary, not
                 sufficient. The conclusions contain a set of 5 rules for
                 the practitioner how to start off building reliable
                 systems. Everything is underlined with running examples
                 from bridge building and computer systems.}
}

@Article{Huang:1992:SSA,
  author =       "Shing Tsaan Huang and Nian Shing Chen",
  title =        "A self-stabilizing algorithm for constructing breadth-first trees",
  journal =      j-IPL,
  volume =       "41",
  number =       "2",
  pages =        "109--117",
  day =          "14",
  month =        feb,
  year =         "1992",
  coden =        "IFPLAT",
  ISSN =         "0020-0190",
  mrclass =      "68M15",
  mrnumber =     "93a:68017",
  bibdate =      "Wed Nov 11 12:16:26 MST 1998",
  acknowledgement = ack-nhfb,
  affiliation =  "Natl Tsing Hua Univ",
  affiliationaddress = "HsinChu, Taiwan",
  classification = "723; 921; C1160 (Combinatorial mathematics); C4240 (Programming and algorithm theory)",
  corpsource =   "Inst. of Comput. Sci., Nat.
Tsing Hua Univ., HsinChu, Taiwan",
  journalabr =   "Inf Process Lett",
  keywords =     "algorithm theory; bounded function; Breadth First Trees;
                 breadth-first trees; computation step; Computer
                 Programming --- Algorithms; Fault Tolerant Software;
                 Mathematical Techniques; rules; self-stabilizing
                 algorithm; Self-Stabilizing Algorithms; Trees; trees
                 (mathematics)",
  treatment =    "T Theoretical or Mathematical",
  annote =       "[to get] Difference to \cite{Chen:1991:SAC}?"
}

@Book{Isermann:1992:IDS,
  author =       {Rolf Isermann},
  ALTeditor =    {},
  title =        {{Identifikation dynamischer Systeme}},
  publisher =    pub-SV,
  year =         {1992},
  OPTkey =       {},
  OPTvolume =    {},
  OPTnumber =    {},
  OPTseries =    {},
  address =      {Berlin},
  OPTedition =   {},
  OPTmonth =     {},
  OPTnote =      {},
  annote =       {[Angabe von Armin]}
}

@Book{Laprie:1992:DBC,
  ALTauthor =    {},
  editor =       {Jean-Claude Laprie},
  title =        {Dependability: {Basic} concepts and Terminology},
  publisher =    pub-SV,
  year =         {1992},
  OPTkey =       {},
  volume =       {5},
  OPTnumber =    {},
  series =       {Dependable Computing and Fault-Tolerant Systems},
  OPTaddress =   pub-SV:adr,
  OPTedition =   {},
  OPTmonth =     {},
  OPTnote =      {},
  annote =       {A joint attempt to unify dependability terminology and
                 present it in 5 languages! Great! Maybe based on
                 \cite{Laprie:1985:DCF}.}
}

@article{Liu:1992:TPF,
  author =       {Zhiming Liu and Mathai Joseph},
  title =        {Transformation of Programs for Fault-tolerance},
  journal =      {Formal Aspects of Computing},
  volume =       {4},
  number =       {5},
  year =         {1992},
  pages =        {442--469},
  annote =       {``The task is then to develop programs which perform
                 predictably in the presence of \emph{detected} system
                 failures, and this requires the representation of such
                 failures in the execution of a program.'' (p.443) This
                 must be done at different levels of abstraction so it is
                 good to use the same formalism for specifications and
                 programs.
The formalism used is close to TLA \cite{Lamport:1994:TLA} with its state based specification method \cite{Lamport:1989:SAS} and notion of refinement \cite{Abadi:1991:ERM}. Physical faults are modeled as actions that transform a good state into an error state which may lead to a violation of the specification. If such an action occurs, a boolean variable $f$ is set to true (modeling fault detection by underlying hardware). Faults cannot destruct program operations at the lowest level of abstraction. The fault affected version of a program $P$ is obtained by a transformation $F$ which is assumed to mimic fail-stop semantics \cite{Schlichting:1983:FSP}, i.e. once a fault action is executed, no further regular program actions occur. To make the fault affected version satisfy the original specification $S$, a fault-tolerant transformation $T$ must be applied such that $F(T(P))$ satisfies $S$. Usually, the transformed version will satisfy a weaker specification. $T$ is modeled as adding recovery actions which are only enabled when $f$ is true. It is assumed that recovery actions are not affected by faults. Overall $F(T(P))$ can then be shown to be equivalent to the parallel execution of program, fault and recovery actions. Fairness guarantees eventual recovery. At a step during the refinement process where there is sufficient information about the fault environment (such as the number of faulty processes/channels), then the recovery transformation can be devised. A specification language for action systems similar to \cite{Chandy:1988:PPD} and a notion of satisfiability between program and specification is devised. (Section 4:) The failure semantics of a program $P$ regarding a fault set $F$ are the set of executions of $P$ augmented by possible executions of actions from $F$. As $F$ is fail-stop, this results in a sequence of good states followed by an empty or infinite sequence of bad states. 
A fault transformation is defined which changes every command construct to result in the failure semantics of the program. With this formal definition of the transformation it is actually proved that $F(P)$ is the parallel execution of $P$ and the fault program. Fault transformations are transferred to sets of processes. Section 5 defines consistency and recovery transformations, the latter in analogy to fault transformations. Section 6 defines fault refinement, proves some properties of it and recovery transformations, and also proves some useful rules when refining programs to make them fault tolerant. A protocol for reliable communication is used as an example for the method. Interestingly, a variable $b$ is used in the fault program which guarantees the finiteness of consecutive faults. Overall, safety and progress properties can now be proved. The discussion (Sect. 8) states that the highest level of fault environment is the transition of $f$ from false to true. The next level action system is then an action which assigns $f$ the value true. Subsequent refinement steps must introduce more information about the system and its faults as the levels on which the faults occur are reached. It is an open question whether for any program and any fault model the value of $f$ can be derived at some point during the refinement process?! Handling real-time is an open question and is handled in later papers \cite{Liu:1996:VFR}. The idea of modeling faults as actions is attributed to \cite{Cristian:1985:RAF}. Overall, the paper is concise and rigorously formal, so at first reading many of the ideas are not readily visible. I had to read it twice, and after the second reading I like this text very much.} } @Misc{Mills:1992:NTP, OPTkey = {}, author = {David L. 
Mills}, title = {Network Time Protocol (Version 3)}, howpublished = {Internet Request for Comments RFC 1305}, year = {1992}, month = mar, OPTnote = {}, OPTannote = {to get} } @PhdThesis{Nordahl:1992:SDD, author = {Jens Nordahl}, title = {Specification and Design of Dependable Communicating Systems}, school = {Department of Computer Science, Technical University of Denmark}, year = {1992}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, OPTmonth = {}, OPTnote = {}, OPTannote = {[to get] coins the terms local and global fault assumptions discussed in \cite{Liu:1995:FFF}.} } @Article{Ostroff:1992:FMS, author = "Jonathan S. Ostroff", title = "{Survey of Formal Methods for the Specification and Design of Real-Time Systems}", journal = "Journal of Systems and Software", volume = "18", number = "2", pages = "33--60", month = apr, year = "1992", annote = "An extensive survey of real-time programming languages, visual modelling languages, and most notably logics and algebras for specifying and verifying real-time systems. Real-time programming languages mostly only have delay and timeout mechanisms but lack formal semantics. Petri Nets are graphical modeling languages. An overview explains the different time semantics (linear, branching) and real-time temporal logics. Contains 144 references." } @InProceedings{Powell:1992:FMA, author = "David Powell", title = "Failure Mode Assumptions and Assumption Coverage", pages = "386--395", ISBN = "0-8186-2875-8", editor = "Dhiraj K. Pradhan", booktitle = "Proceedings of the 22nd Annual International Symposium on Fault-Tolerant Computing ({FTCS} '92)", address = "Boston, MA", month = jul, year = "1992", publisher = "IEEE Computer Society Press", annote = "[to read]" } @InProceedings{Rushby:1992:FSV, author = "John Rushby", editor = "J. 
Vytopil",
  title =        "Formal Specification and Verification of a Fault-Masking and Transient-Recovery Model for Digital Flight-Control Systems",
  booktitle =    "Formal Techniques in Real-Time and Fault-Tolerant Systems 2nd International Symposium",
  series =       ser-LNCS,
  volume =       "571",
  pages =        "237--258",
  publisher =    pub-SV,
  address =      "Nijmegen, The Netherlands",
  year =         "1992",
  annote =       "[to read] appears also under the same name in a book of
                 the same name published by the same editor in 1993
                 (Kluwer)."
}

@Book{Siewiorek:1992:RCS,
  author =       "Daniel Siewiorek and Robert Swarz",
  title =        "Reliable Computer Systems: Design and Evaluation",
  publisher =    "Digital Press",
  year =         "1992",
  OPTcrossref =  "",
  OPTkey =       "",
  OPTeditor =    "",
  OPTvolume =    "",
  OPTnumber =    "",
  OPTseries =    "",
  OPTaddress =   "",
  OPTedition =   "",
  OPTmonth =     "",
  OPTnote =      "",
  OPTannote =    "[get it?]"
}

@InProceedings{Shankar:1992:MVG,
  title =        "Mechanical Verification of a Generalized Protocol for {Byzantine} Fault-Tolerant Clock Synchronization",
  author =       "Natarajan Shankar",
  booktitle =    "Formal Techniques in Real-Time and Fault-Tolerant Systems",
  editor =       "J. Vytopil",
  pages =        "217--236",
  publisher =    pub-SV,
  series =       ser-LNCS,
  volume =       "571",
  month =        jan,
  year =         "1992",
  address =      "Nijmegen, The Netherlands",
  annote =       "[to read]"
}

@Article{Turek:1992:MFC,
  author =       {John Turek and Dennis Shasha},
  title =        {The many faces of consensus in distributed systems},
  journal =      j-IEEE-COMPUTER,
  year =         {1992},
  volume =       {25},
  number =       {6},
  month =        jun,
  pages =        {8--17},
  OPTnote =      {},
  annote =       "A rewarding (because well written) paper on the
                 different shades of (im)possibility of consensus in
                 distributed systems.
Starts with the parable of La Tryste, notes general settings in which consensus is (im)possible in message passing systems (synchrony of processors, message order, communication delay, transmission method), relates results to shared memory settings, sketches Fischer, Lynch and Paterson's result \cite{Fischer:1985:IDC}, proves impossibility of Byzantine agreement in message passing settings without signatures. Concludes: Global knowledge is much stronger than local knowledge." } @Article{Zhao:1992:SAB, title = "A Self-Adjusting Algorithm for {Byzantine} Agreement", author = "Yi Zhao and Farokh B. Bastani", journal = j-DC, pages = "219--226", year = "1992", volume = "5", number = "4", annote = "" } @Article{Abadi:1993:CS, author = {Mart{\'\i}n Abadi and Leslie Lamport}, title = {Composing Specifications}, journal = j-TOPLAS, year = {1993}, OPTkey = {}, volume = {15}, number = {1}, pages = {73--132}, month = jan, OPTnote = {}, annote = {A ground- and breathtaking paper on the difficulties arising when composing specifications of subsystems to get a specification of the composed system. It is a formal investigation of the exact formulation of the composition principle for concurrent systems. A system is here something that interacts with an environment over a well-defined boundary. A specification of a system here is a set of behaviors at the boundary where the environment and the system alternately take steps. Steps of the system can contain stuttering steps (i.e., steps where the state of the interface does not change) and the environment makes the first move. A specification can be expressed by $E\Rightarrow M$ where $E$ is an assumption about the environment and $M$ is the property guaranteed by the system. This is the understanding of the transition-axiom approach \cite{Lamport:1989:SAS}. 
The composition principle states that the composition $S$ of systems $S_1,\ldots,S_n$ satisfies a specification $E\Rightarrow M$ if three conditions hold: (1) $S$ guarantees $M$ if every $S_i$ guarantees its own $M_i$. (2) If $E$ holds and every $S_i$ guarantees $M_i$ then $E_i$ holds for every $S_i$. (3) Every $S_i$ guarantees $M_i$ if $E_i$ holds. There is an obvious circularity here because every component is part of the environment of the other. The main result states that the composition principle is valid if the environment assumptions are safety properties. The paper contains a lot of insightful discussions about related aspects of specifications and programming: state vs. action based formalisms are compared in section 1.1, the distinction between system and environment is treated in sections 1.2 and 1.3. Section 3 contains an elaborated discussion on realizability of specifications and Section 4 details on the form of a specification. It examines what makes up a complete or a partial program and what difficulties arise in composition. For example, progress properties are an inherent part of programs, but are often stated implicitly as an incrementation of the program counter or fairness assumptions. Formally, progress properties are defined using the term `machine realizable', meaning something like `it doesn't add additional safety properties'. A specification then is a formula $I\cap E_S\cap E_L\Rightarrow M_S\cap M_L$, where $E_S$ and $M_S$ are the safety properties of the environment and the system respectively, and $E_L$ and $M_L$ are their liveness properties; $I$ is an initial state predicate on the environment state. Theorem 1 shows that $E_L$ can be incorporated into the system's liveness property, resulting in a specification being $I\cap E_S\Rightarrow M_S\cap (E_L\Rightarrow M_L)$. This means that the prerequisites of the composition principle are achievable. 
Section 4.4 states also that everything can be moved to the right hand side of the implication. Then the specification does not only specify wanted behaviour but also allows arbitrary behavior if $E_S$ is not met. The authors argue that this is impractical.}
}

@ARTICLE{Afek:1993:SSU,
  AUTHOR =       "Yehuda Afek and Geoffrey M. Brown",
  TITLE =        "Self-stabilization over unreliable communication media",
  JOURNAL =      j-DC,
  VOLUME =       7,
  YEAR =         1993,
  PAGES =        "27--34"
}

@ARTICLE{Arora:1993:CCF,
  AUTHOR =       "Anish Arora and Mohamed Gouda",
  TITLE =        "Closure and convergence: {A} foundation of fault-tolerant computing",
  JOURNAL =      j-IEEE-TRANS-SOFTW-ENG,
  VOLUME =       19,
  NUMBER =       11,
  YEAR =         1993,
  PAGES =        "1015--1027"
}

@Article{Arora:1993:CIS,
  author =       "Anish Arora and Paul Attie and Michael Evangelist and Mohamed Gouda",
  title =        "Convergence of iteration systems",
  OPTcrossref =  "",
  OPTkey =       "",
  journal =      j-DC,
  year =         "1993",
  volume =       "7",
  number =       "1",
  pages =        "43--53",
  OPTmonth =     "",
  OPTnote =      "",
  OPTannote =    ""
}

@INPROCEEDINGS{Anagnostou:1993:TTP,
  AUTHOR =       "Efthymios Anagnostou and Vassos Hadzilacos",
  TITLE =        "Tolerating transient and permanent Failures",
  BOOKTITLE =    pro-wdag93,
  YEAR =         1993,
  PAGES =        "174--188",
  annote =       "The authors investigate the classes of problems which
                 are solvable in the presence of transient and permanent
                 failures. They begin by stating that self-stabilization
                 has been the domain of research on tolerating transient
                 failures which manifest themselves as arbitrary memory
                 corruptions. On the other hand, fault tolerance has
                 focussed on permanent failures such as process crashes.
                 While transient failures could affect all processes,
                 permanent failures were restricted to a certain subset
                 of processes (usually half or one third of all
                 processes). The authors show that tolerating transient
                 and permanent failures is impossible in asynchronous
                 systems for all problems which are ``failure
                 sensitive''.
Failure sensitive problems are such that it is vitally important to know whether a process has crashed or not. Examples for failure sensitive problems are leader election, consensus and spanning tree construction. As an example for a solvable problem they give an algorithm for unique naming in ring networks. These results give insight into the fundamental distinction between transient and permanent failures: transient failures are detectable in asynchronous systems, permanent ones are not. But the impossibility results are not too devastating since election and consensus are unsolvable in asynchronous systems anyway \cite{Fischer:1985:IDC}."
}

@InCollection{Babaoglu:1993:CGS,
  author =       "{\"O}zalp Babao{\u{g}}lu and Keith Marzullo",
  title =        {Consistent global states of distributed systems: {Fundamental} concepts and mechanisms},
  booktitle =    {Distributed Systems},
  crossref =     {Mullender:1993:DS},
  OPTkey =       {},
  publisher =    pub-AW,
  year =         {1993},
  editor =       {Sape Mullender},
  OPTvolume =    {},
  OPTnumber =    {},
  OPTseries =    {},
  chapter =      {4},
  OPTtype =      {},
  OPTaddress =   {},
  edition =      {Second},
  OPTmonth =     {},
  pages =        {55--96},
  OPTnote =      {},
  annote =       "A well written survey on the theory of consistent global
                 states. It is well suited as an introductory text for
                 lectures on causality, distributed computations,
                 snapshots, observations and predicate detection. A more
                 research oriented text is \cite{Schwarz:1994:DCR}."
}

@Article{Barborak:1993:CPF,
  author =       "Michael Barborak and Anton Dahbura and Miroslaw Malek",
  title =        "The consensus problem in fault-tolerant computing",
  OPTcrossref =  "",
  OPTkey =       "",
  journal =      j-ACM-COMP-SURVEYS,
  year =         "1993",
  volume =       "25",
  number =       "2",
  pages =        "171--220",
  month =        jun,
  OPTnote =      "",
  annote =       "This paper surveys research on the consensus problem,
                 comparing and unifying the two traditional approaches,
                 which are (1) system diagnosis and (2) the Byzantine
                 Generals Problem (BGP).
Approach (1) tries to reach a consistent state by letting nodes diagnose each other and infer from the resulting set of results (together with additional assumptions) which nodes are faulty. These nodes can then be avoided or shut down. Approach (2) applies distributed algorithms that reach nontrivial consensus on a single value in spite of possibly malicious faulty nodes within the network. Faults are thus masked. The paper is a near-to-complete survey of research up to about 1993, rather technical but exact."
}

@INPROCEEDINGS{Berrou:1993:NSL,
  AUTHOR =       "C. Berrou and A. Glavieux and P. Thitimajshima",
  TITLE =        "Near {Shannon} Limit Error-Correcting Coding and Decoding: Turbo Codes",
  PAGES =        "1064--1070",
  booktitle =    "IEEE Int. Conf. on Communications (ICC-1993)",
  year =         1993,
  annote =       "the basic reference to the term `turbo codes'."
}

@Article{Birman:1993:PGA,
  author =       "K. P. Birman",
  title =        "The Process Group Approach to Reliable Distributed Computing",
  journal =      j-CACM,
  volume =       "36",
  number =       "12",
  pages =        "36--53",
  year =         "1993",
  OPTkeywords =  "ISIS, process groups, replicated processes",
}

@Article{Bowen:1993:SCS,
  author =       {Jonathan Bowen and Victoria Stavridou},
  title =        {Safety-critical systems, formal methods and standards},
  journal =      {IEE/BCS Software Engineering Journal},
  year =         {1993},
  OPTkey =       {},
  volume =       {8},
  number =       {4},
  pages =        {189--209},
  month =        jul,
  OPTnote =      {},
  annote =       {A well-written survey of the use of formal methods in
                 industry for the design and implementation of safety
                 critical systems as of 1992 (should be read in
                 conjunction with \cite{Rushby:1994:CSP}). A source for
                 lots of citations on the importance of dependability and
                 ways to achieve it. I especially like the introduction
                 ``Human lives have depended on mathematical calculations
                 for centuries\ldots'' where Babbage is shown to be one
                 of the first researchers in computer dependability.
Gives examples in the fields of aviation, railway systems, nuclear power plants, medical systems, ammunition control and embedded microprocessors. Standards are rather UK centric. Great bibliography.} } @InProceedings{Diehl:1993:RAD, author = "Claire Diehl and Claude Jard and Jean-Xavier Rampon", title = "Reachability Analysis on Distributed Executions", pages = "629--643", year = "1993", month = apr # "~13--17,", editor = "Marie-Claude Gaudel and Jean-Pierre Jouannaud", booktitle = "Proceedings of the 4th International Joint Conference on Theory and Practice of Software Development {TAPSOFT}'93", address = "Orsay, France", series = ser-LNCS, number = "668", publisher = pub-SV, annote = "[to read]" } @ARTICLE{Dolev:1993:SDS, AUTHOR = "Shlomi Dolev and Amos Israeli and Shlomo Moran", TITLE = "Self-stabilization of dynamic systems assuming only read/write atomicity", JOURNAL = j-DC, VOLUME = 7, YEAR = 1993, PAGES = "3--16", annote = "describes fair protocol combination, i.e., composition of self-stabilizing protocols." } @InProceedings{Dolev:1993:WCS, title = "Wait-Free Clock Synchronization (Extended Abstract)", author = "Shlomi Dolev and Jennifer L. Welch", pages = "97--108", booktitle = pro-podc93, address = "Ithaca, New York, USA", month = aug, year = "1993", annote = "The problem solved is the following: build an algorithm that guarantees that for some fixed $k$ a processor P which has been working correctly for $k$ time units (and as long as it continues to work correctly) satisfies: (1) P's clock ticks normally (i.e., it is not adjusted), and (2) P's clock agrees with the clocks of all other processes which have been working correctly for the last $k$ time periods. The algorithm should handle any form of transient failures as well as ``napping'' failures, i.e., processors stop operation for arbitrary long times and then resume work without noticing that they have stopped. 
A protocol that achieves this goal in the presence of napping failures is called wait-free. The authors present four such algorithms for different system settings (non/assumption of global pulse, global/local read/write atomicity etc.). Two of these protocols are both wait-free and self-stabilizing. Clocks seem to be unbounded. " } @Book{Freyermuth:1993:WFB, author = {B. Freyermuth}, ALTeditor = {}, title = {{Wissensbasierte Fehlerdiagnose am Beispiel eines Industrieroboters}}, publisher = {VDI-Verlag}, year = {1993}, OPTkey = {}, OPTvolume = {}, number = {315}, series = {Fortschr.-Ber. VDI Reihe 8}, address = {{D\"usseldorf}}, OPTedition = {}, OPTmonth = {}, note = {Dissertation TH Darmstadt}, annote = {[Angabe von Armin]} } @InProceedings{Gopal:1993:USF, author = {Ajei S. Gopal and Kenneth J. Perry}, title = {Unifying self-stabilization and fault-tolerance}, booktitle = pro-podc93, year = {1993}, publisher = {ACM Press}, pages = {195--206}, annote = "The authors explore the possibility of building protocols that tolerate transient (``systemic'') as well as permanent (``process'') failures. They arrive at similar conclusions as Anagnostou and Hadzilacos \cite{Anagnostou:1993:TTP}: there are no protocols that can solve general problems in finite stabilization time because it is impossible to distinguish a crashed process from one that continually experiences send omission failures. Even more, the process which cannot send messages does not know whether it can communicate or not because of its inability to determine how it arrived at its present state. It is however possible to solve problems if ``solvability'' is restricted to the communicating (or functioning) subset of processes. These results count for synchronous (round based) protocols. The idea of problem solving in the presence of transient faults is: never terminate and regularly purge your computation history. 
In the paper, also asynchronous systems are examined: the authors present a self-stabilizing eventually strong failure detector based on an eventually weak failure detector. This failure detector can help solve consensus in transient fault environments. It uses unbounded counters and resembles very much the Heartbeat failure detector \cite{Aguilera:1997:HTF}." } @article{Gumm:1993:AGA, author = "H. Peter Gumm", title = {Another glance at the {Alpern-Schneider} characterization of safety and liveness in concurrent executions}, journal = j-IPL, volume = "47", number = "6", pages = "291--294", year = "1993", url = "citeseer.nj.nec.com/gumm93another.html", annote = "Revisits the Alpern-Schneider result \cite{Alpern:1985:DL} on ``every property is the intersection of a safety and liveness property'' in a more abstract setting: The result is restated in the context of a meet-preserving map between two complete Boolean algebras. The theorem is more general than Alpern-Schneider since it allows a new application in a simplified setting of UNITY style logics \cite{Chandy:1988:PPD}: safety properties are those where a set of transitions is forbidden. This is similar to the fusion-closedness assumption on specifications of \cite{Arora:1998:CDM}." } @InCollection{Hadzilacos:1993:FTB, author = {Vassos Hadzilacos and Sam Toueg}, title = {Fault-tolerant broadcasts and related problems}, booktitle = {Distributed Systems}, crossref = "Mullender:1993:DS", OPTkey = {}, publisher = pub-AW, year = {1993}, editor = {Sape Mullender}, chapter = {5}, edition = {Second}, pages = {97--145} } @ARTICLE{Katz:1993:SEM, AUTHOR = "Shmuel Katz and Kenneth J. 
Perry", TITLE = "Self-stabilizing extensions for message-passing systems", JOURNAL = j-DC, VOLUME = 7, YEAR = 1993, PAGES = "17--26", annote = "[to write]" } @InCollection{Kopetz:1993:RTD, author = {Hermann Kopetz and Paulo Ver{\'\i}ssimo}, title = {Real Time and Dependability Concepts}, booktitle = {Distributed Systems}, crossref = {Mullender:1993:DS}, OPTkey = {}, pages = {411--446}, publisher = pub-AW, year = {1993}, editor = {Sape Mullender}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTtype = {}, chapter = {16}, OPTaddress = {}, edition = {Second}, OPTmonth = {}, OPTnote = {}, annote = {gives informal overview over (among others) redundancy} } @InProceedings{Kurshan:1993:VM6, author = "R. P. Kurshan and L. Lamport", booktitle = "Proceedings of the 5th International Conference on Computer Aided Verification", year = "1993", editor = "C. Courcoubetis", address = "Elounda, Greece", series = ser-LNCS, volume = "697", publisher = pub-SV, title = "Verification of a Multiplier: 64 Bits and Beyond", pages = "166--179", } @InCollection{Liu:1993:SVR, author = {Zhiming Liu and Mathai Joseph}, title = {Specification and verification of recovery in asynchronous communicating systems}, booktitle = {Formal Techniques in Real-time and Fault-tolerant Systems}, OPTcrossref = {}, OPTkey = {}, pages = {137--165}, publisher = {Kluwer}, year = {1993}, editor = {Jan Vytopil}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTtype = {}, chapter = {6}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {A nice presentation of transformation based fault-tolerance verification similar to \cite{Peled:1994:CFF}. The paper first nicely presents the formal prerequisites (states and behaviors, specifications and programs, refinement, asynchronous communication). Faults are modeled as a set of fault actions and a fault transformation, fault-tolerant refinement are defined as in \cite{Liu:1996:VFR,Liu:1995:FFF} although I like the presentation here most.
Detection is not covered here; an error variable flags the detection of a physical fault. Fault tolerance is achieved by another form of transformation exemplified for the class of checkpointing and backward-recovery programs. Consistent checkpoints and rollback operations to the most recent checkpoint are treated in length and some Theorems about the sufficiency of this method are proved (reminds me of the optimality proof of \cite{Singhal:1995:OPA}). Failure during recovery is discussed: if recovery is fault-tolerant or not subject to faults, then recovery can be assumed atomic. Failures within recovery can be handled by restarting recovery when they are detected. Failures during checkpointing are handlable if we assume that there is at least one (initial) checkpoint available to which rollback is possible. The conclusions state that backward recovery will result in the satisfaction of a degraded specification. In open systems the repeated communication with the environment must not be neglected. The method of fault modeling is attributed to \cite{Cristian:1985:RAF}. This paper is seen as a generalization of this work and that of \cite{Schlichting:1983:FSP}.} } @InProceedings{Li:1993:FTD, author = {Pei-yu Li and Bruce McMillin}, title = {Fault-Tolerant Distributed Deadlock Detection/Resolution}, booktitle = {Proceedings of the 17th Annual International Computer Software and Applications Conference (COMPSAC'93)}, OPTcrossref = {}, OPTkey = {}, pages = {224--230}, year = 1993, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, month = nov, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, annote = {Also University of Missouri, Rolla, Department of Computer Science Technical Report Number CSC 92-04. This paper takes the fault diagnosis approach to failure detection (look at \cite{Barborak:1993:CPF} for an intro to system diagnosis). 
I'm not sure how this relates to the standard interpretation of failure detectors \cite{Chandra:1996:UFD}, but here eventually every correct process knows the identities of all failed processes. A deadlock detection algorithm is proposed using a priority based probe approach to find cycles in the wait-for graph of an application. It can only detect deadlocks if there is at most one process failure in a deadlock cycle (a result is cited why being better is not possible). I don't see where this restriction comes from. Overall a nice text giving a somewhat different view of detecting stable predicates. Does not cite \cite{Shah:1984:DSS} although that paper also does deadlock detection.} } @InProceedings{Lincoln:1993:FVA, title = "Formal Verification of an Algorithm for Interactive Consistency under a Hybrid Fault Model", author = "Patrick Lincoln and John Rushby", booktitle = "Computer-Aided Verification, CAV '93", editor = "Costas Courcoubetis", pages = "292--304", publisher = pub-SV, series = "Lecture Notes in Computer Science", volume = "697", month = jun # "/" # jul, year = "1993", address = "Elounda, Greece", annote = "Good cite for the term `hybrid fault model'." } @Book{Mullender:1993:DS, editor = "Sape Mullender", title = "Distributed Systems", publisher = pub-AW, edition = "Second", year = 1993, annote = "An excellent collection of substantial papers not only on the theoretical foundations of distributed systems (although these chapters are especially rewarding)." 
} @Article{Neiger:1993:SSC, title = "Simulating Synchronized Clocks and Common Knowledge in Distributed Systems", author = "Gil Neiger and Sam Toueg", area = "Distributed Computing", pages = "334--367", journal = "Journal of the ACM", month = apr, year = "1993", volume = "40", number = "2", annote = "[to read]" } @InProceedings{Nordahl:1993:DFD, author = {Jens Nordahl}, title = {Design for dependability}, booktitle = {Proceedings of the third IFIP International Working Conference on Dependable Computing for Critical Applications (DCCA-3)}, OPTcrossref = {}, OPTkey = {}, pages = {29--38}, year = {1993}, editor = {Carl E. Landwehr}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTmonth = {}, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Nordahl shows how to verify that a system consisting of subcomponents can be proved correct in the presence of component failures. Three concepts are basic: (1) design, (2) correctness of design and (3) failure mode. A system is a process (here, CSP \cite{Hoare:1984:CSP} is used as a formalism throughout; we use the terminology of Lamport \cite{Lamport:1989:SAS} in this annotation). A specification is a property. A process $P$ implements a specification $S$ if all executions of $P$ are contained in $S$. A distinction between processes and specifications is made but it is remarked that this is not a central requirement (so other formalisms such as \cite{Pnueli:1981:TSC} can be used). A system can consist of a collection of subsystems (or components). A design determines how the components interact. (1) A design is a tuple consisting of a function mapping $n$ systems to a (new) system and a set of $n$ subsystem specifications. The function can be some composition operator (parallel, sequential etc. and compositions of these operators). A design contains enough information to reason about the corresponding hierarchical level.
(2) A design is correct regarding a specification $S$ iff the combined system satisfies $S$ whenever the subcomponents satisfy their specification. (3) A failure mode is a specification describing the behavior of a system when it is faulty. Such a failure mode may be given by a component designer when making assumptions at design time or it may be derived by an engineer from observing faulty system behavior at runtime. For example, a failure mode for Byzantine behaviour is the predicate true. Components can have several failure modes, and for $n$ components this is expressed as an $n$-tuple $(F_1,\ldots,F_n)$ of sets of failure modes. For one combination of failure modes $(S_1,\ldots,S_n)$ one can prove that a design involving these subcomponents is correct regarding some system specification $S$. This can be extended to cover all possible combinations of component failure modes (e.g. to show that the system satisfies $S$ in any case). Two notions of fault-tolerance are defined: masking fault tolerance (called `fault-tolerance') and fail-softness. A system design is fault-tolerant if it is correct regarding an $n$-tuple of component failure modes and the original correctness specification $S$. Fail-softness is defined as fault tolerance where $S$ is replaced by some weaker specification (which one to choose is a pragmatic issue, says Nordahl). The proof of correctness of design and fault tolerance can now be performed in the same logical framework as before. An example (stand by spare system) is given and proved. The conclusions discuss the following aspects: (a) the faulty behavior of a component is not given as a ``delta'' of its original correctness specification and a description of faulty behavior, but rather as a ``finished'' specification (i.e. a failure mode). Another approach is to calculate the weakened specification from the original specification and a failure model (such methods are \cite{Liu:1992:TPF,Peleska:1991:DVF,Gaertner:1999:ESD}).
The disadvantage of the calculational approach is the necessity of calculations and the restrictions imposed on specifying faulty behavior. (I think both are equivalent.) (b) compositionality is achieved by defining fault tolerance of a design as a function of a single combination of subcomponent failure modes. Global assumptions about what combinations may arise can be dealt with at a higher level. (c) calculating the likelihood of failure can be integrated into the method quite easily by associating probabilistic measures to combinations of failure modes. Overall this is a very concise and well-written paper.} } @InProceedings{Ricciardi:1993:UPN, author = "Aleta Ricciardi and {Andr\'e} Schiper and Kenneth Birman", title = "Understanding partitions and the ``no partition'' assumption", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", OPTpages = "", booktitle = "Proceedings of the 4th Workshop on Future Trends of Distributed Computing Systems (FTDCS-4)", year = "1993", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", annote = "[to read]" } @InProceedings{Schepers:1993:CPT, author = "R. Gerth and H. Schepers", title = "A Compositional Proof Theory for Fault Tolerant Real-Time Distributed Systems", pages = "34--43", booktitle = "Symposium on Reliable Distributed Systems ({SRDS} '93)", month = oct, publisher = "IEEE Computer Society Press", address = "Los Alamitos, Ca., USA", year = "1993", ISBN = "0-8186-4310-2", annote = "[to get] Extends the work of \cite{Schepers:1994:TCP} to real time." } @InProceedings{Schepers:1993:TFT, author = {Henk Schepers}, title = {Tracing Fault Tolerance}, booktitle = {Proceedings of the third IFIP International Working Conference on Dependable Computing for Critical Applications (DCCA-3)}, OPTcrossref = {}, OPTkey = {}, pages = {39--48}, year = {1993}, editor = {Carl E. 
Landwehr}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTmonth = {}, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Basis to this paper is a system of sequential processes communicating via synchronous unidirected channels much in the sense of CSP \cite{Hoare:1984:CSP}. The semantics of a process are the set of possible message sequences (called histories or behaviors) at its observable channels. The set of all histories is divided into normal, exceptional and catastrophic behaviors. Normal and exceptional ones are acceptable, and only these are covered by fault tolerance mechanisms. Catastrophic behaviors fall outside of the fault hypothesis. A fault hypothesis is a reflexive relation on histories defining how the fault changes the history (i.e. a relation on normal behaviors and exceptional behaviors). A set of behaviors with respect to a fault hypothesis is obtained by augmenting the original set of traces according to the fault hypothesis relation. Reflexivity ensures that traces are only added to (and none removed from) the original set of traces. To prove that a system tolerates some fault hypothesis one must show that the composition of the original system running under some fault hypothesis and some tolerance mechanism satisfies the original correctness specification. The examples given are a communication channel which may lose or corrupt messages, and a ``stable disk''. Only safety properties are investigated. The conclusions contain a good survey of formal methods in fault tolerance up to 1993: Cristian \cite{Cristian:1985:RAF} is cited as the first to separate normal specification from tolerance specification. Formalisms in which faults are treated explicitly are \cite{Weber:1989:FSF,Joseph:1987:PRF,Peleska:1991:DVF}. The final sentence is: ``We currently investigate modeling graceful degradation as switching to another, less ambitious, set of acceptable histories.'' For this, see \cite{Herlihy:1991:SGD}.
An extended version appeared as \cite{Schepers:1994:TCP}.} } @InProceedings{Schiper:1993:VSC, author = "{Andr\'e} Schiper and Aleta Ricciardi", title = "Virtually-synchronous communication based on a weak failure suspector", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "534--543", booktitle = pro-ftcs93, year = "1993", OPTorganization = "", OPTpublisher = "", OPTaddress = "", OPTmonth = "", OPTnote = "", annote = "A (quite mind-blowing) paper with lots of notation and definitions on how to implement a group membership service with certain semantics in asynchronous environments. The information that a process has about the functional states of the other processes in the group is called its view. Membership services that allow only a single view to exist in the system are said to have linear semantics. Those which allow concurrent views have either weak-partial (views may overlap) or strong partial (views may not overlap) semantics. The paper shows that strong partial semantics are related to virtually synchronous communication (VSC), however, an intuitive definition of VSC is not readily given. The authors propose a three-component architecture for implementing VSC in asynchronous systems: a weak failure suspector forms the basis for a view and a multicast component, which interact on a higher level. The failure suspector has weak completeness and the accuracy is ensured either by forcefully crashing the suspected process or by ensuring that the suspected process equally suspects the suspecting process. Crashed processes can recover but are thereafter new processes with new process identities. The failure suspector used here does not seem to fit into the scheme of Chandra and Toueg \cite{Chandra:1996:UFD}."
} @ARTICLE{Schneider:1993:SS, AUTHOR = "Marco Schneider", TITLE = "Self-stabilization", JOURNAL = j-ACM-COMP-SURVEYS, VOLUME = 25, number = 1, YEAR = 1993, PAGES = "45--67", annote = "Standard reference survey on self-stabilization, nearly always cited together with Dijkstra \cite{Dijkstra:1974:SSS}." } @InCollection{Schneider:1993:WGM, author = {Fred B. Schneider}, title = "What good are models and what models are good?", booktitle = "Distributed Systems", OPTcrossref = {Mullender:1993:DS}, publisher = pub-AW, year = 1993, editor = {Sape Mullender}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, chapter = {2}, OPTtype = {}, OPTaddress = {}, edition = {Second}, OPTmonth = {}, pages = {17--26}, OPTnote = {}, OPTannote = {} } @PhdThesis{Varghese:1993:SLC, author = {George Varghese}, title = {Self-stabilization by local checking and correction}, school = {MIT}, year = {1993}, OPTkey = {}, OPTaddress = {}, OPTtype = {}, OPTmonth = {}, note = {Published as Technical Report MIT/LCS/TR-583}, OPTannote = {to write} } @InCollection{Verissimo:1993:RTC, author = {Paulo Ver{\'\i}ssimo}, title = {Real-time communication}, booktitle = {Distributed Systems}, crossref = {Mullender:1993:DS}, OPTkey = {}, pages = {447--490}, publisher = pub-AW, year = {1993}, editor = {Sape Mullender}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTtype = {}, chapter = {17}, OPTaddress = {}, edition = {Second}, OPTmonth = {}, OPTnote = {}, annote = {Among other things, defines steadiness and tightness} } @Article{Abadi:1994:OFR, author = "Mart{\'\i}n Abadi and Leslie Lamport", title = "An Old-Fashioned Recipe for Real Time", journal = j-TOPLAS, volume = "16", number = "5", pages = "1543--1571", month = sep, year = "1994", url = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/186058.html", abstract = "Traditional methods for specifying and reasoning about concurrent systems work for real-time systems.
Using TLA (the temporal logic of actions), we illustrate how they work with the examples of a queue and of a mutual-exclusion protocol. In general, two problems must be addressed: avoiding the real-time programming version of Zeno's paradox, and coping with circularities when composing real-time assumption/guarantee specifications. Their solutions rest on properties of machine closure and realizability.", keywords = "theory; verification", subject = "{\bf F.3.1}: Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs, Specification techniques. {\bf D.2.4}: Software, SOFTWARE ENGINEERING, Program Verification, Correctness proofs.", annote = "[to get]" } @Article{Afek:1994:RCU, title = "Reliable Communication Over Unreliable Channels", author = "Yehuda Afek and Hagit Attiya and Alan Fekete and Michael Fischer and Nancy Lynch and Yishay Mansour and Dai-Wei Wang and Lenore Zuck", pages = "1267--1297", journal = "Journal of the ACM", month = nov, year = "1994", volume = "41", number = "6", annote = "[to read]" } @InProceedings{Alur:1994:FF, title = "Finitary Fairness", author = "Rajeev Alur and Thomas Henzinger", pages = "52--61", booktitle = "Proceedings, Ninth Annual {IEEE} Symposium on Logic in Computer Science", year = "1994", month = "4--7 " # jul, address = "Paris, France", organization = "IEEE Computer Society Press", references = "{STOC::AlurAT1994} {JACM::BrachaT1985} {JACM::DworkLS1988} {JACM::FischerLP1985} {JACM::PeaseSL1980}", annote = "Introduces the term finitary fairness: requires that for every run of the system there is an unknown bound $k$ such that no enabled transition is postponed more than $k$ consecutive times. Cited and discussed in \cite{Merritt:1998:FSO}." } @INPROCEEDINGS{Arora:1994:CSB, AUTHOR = "Anish Arora and Mohamed G.
Gouda and George Varghese", TITLE = "Constraint satisfaction as a basis for designing nonmasking fault-tolerance", BOOKTITLE = pro-icdcs94, YEAR = 1994, PAGES = "424--431", annote = "Important paper on self-stabilization methodologies. Has many relations to Varghese's thesis \cite{Varghese:1993:SLC}. Published as a more citable journal version \cite{Arora:1996:CSB}." } @ARTICLE{Arora:1994:DR, AUTHOR = "Anish Arora and Mohamed G. Gouda", TITLE = "Distributed reset", JOURNAL = j-IEEE-TRANS-COMP, VOLUME = 43, NUMBER = 9, MONTH = sep, YEAR = 1994, PAGES = "1026--1038", annote = "" } @InProceedings{Arora:1994:ERT, author = "Anish Arora", title = "Efficient Reconfiguration of Trees: {A} Case Study in Methodical Design of Nonmasking Fault-Tolerant Programs", booktitle = "Proceedings of the 3rd International Symposium on Formal Techniques in Real-Time and Fault-Tolerant Systems (FTRTFTS'94)", year = "1994", editor = "{H. Langmaack} and {W.-P. de Roever} and {J. Vytopil}", pages = "110--127", organization = "Organized Jointly with the Working Group Provably Correct Systems-ProCoS", volume = "863", series = ser-LNCS, publisher = pub-SV, address = "L{\"u}beck, Germany", month = sep, annote = "An application of the method of constraint satisfaction \cite{Arora:1994:CSB} to the problem of maintaining a rooted spanning tree in a network of nodes that may failstop, recover and where links may go down temporarily. Contains a brief discussion on the benefits of nonmasking fault tolerance. Shows that the concept of stabilization can handle ``permanent'' faults as well." } @INPROCEEDINGS{Awerbuch:1994:SLC, AUTHOR = "Baruch Awerbuch and Boaz Patt-Shamir and George Varghese and Shlomi Dolev", TITLE = "Self-stabilization by local checking and global reset", BOOKTITLE = pro-wdag94, YEAR = 1994, PAGES = "326--339", annote = "to write" } @Article{Chandrasekar:1994:ASA, author = "Srinivasan Chandrasekar and Pradip K.
Srimani", title = "A self-stabilizing algorithm to synchronize digital clocks in a distributed system", journal = "Computers and Electrical Engineering", volume = "20", number = "6", year = "1994", pages = "439--444", annote = "Focusses on maintaining ``hardware'' clocks in step. Takes the self-stabilization view (as done by \cite{Gouda:1990:SU,Arora:1991:MDS}). This means that nodes access neighboring states by reading variables. Thus it abstracts from message passing and physical clock drift." } @InProceedings{Cristian:1994:AFT, author = "Flaviu Cristian", title = "Abstractions for Fault-Tolerance", pages = "278--286", ISBN = "0-444-81988-6", editor = "Karen Duncan and Karl Krueger", booktitle = "Proceedings of the {IFIP} 13th World Computer Congress. Volume 3 : Linkage and Developing Countries", month = aug, publisher = "Elsevier Science Publishers", address = "Amsterdam, The Netherlands", year = "1994", annote = "The author presents some fundamental concepts of fault tolerance and uses them to discuss several current paradigms of fault tolerant computing. Basic concepts include notions of service, server, the depends-upon relation, failure classification, failure semantics, failure masking by hierarchical masking or by group masking. The fault tolerant services discussed are: duplicated processors with matching to provide crash failure semantics, error detection/correction codes in stable storage to provide read omission failure semantics, restartable servers, point-to-point communication services, distributed storage services, restartable services, replicated storage and servers. Overall a paper along the masking fault tolerance perspective as in \cite{Cristian:1991:UFD}.
When redundancy is not available anymore, ``users must have some manegable form of system behaviour that they can handle without too much pain.'' Interesting are the two laws of fault tolerance: First law: ``The stronger a specified failure semantics, the more expensive and complex it is to build a server that implements it.'' Second law: ``The weaker the failure semantics of members and communication, the more complex and expensive the group management mechanisms become.'' Are these laws useful?" } @Article{Cristian:1994:CHW, author = {Flaviu Cristian and Richard de Beijer and Shivakant Mishra}, title = {Comparing how well asynchronous atomic broadcast protocols perform}, journal = {Distributed Systems Engineering Journal}, year = {1994}, OPTkey = {}, volume = {1}, number = {4}, pages = {177--201}, OPTmonth = {}, OPTnote = {}, annote = {[to read] Title of the TR: A performance comparison of asynchronous atomic broadcast protocols.} } @InCollection{Cristian:1994:CSP, author = "Flaviu Cristian and Houtan Aghili and Ray Strong", editor = "Zhonghua Yang and T. Anthony Marsland", title = "Clock Synchronization in the Presence of Omission and Performance Failures, and Processor Joins", booktitle = "Global States and Time in Distributed Systems, IEEE Computer Society Press", year = "1994", annote = "A revised version of \cite{Cristian:1986:CSP}. Gives a simplified version of the protocol of \cite{Dolev:1995:DFC}, possible by reducing the types of failures assumed to occur. Here, only omission and performance failures are taken into account that do not partition the network. The algorithm is based on the paradigm of message diffusion. It assumes a maximum message delivery delay and a bounded drift rate of hardware clocks. It is mentioned that the MTTF of modern quartz clocks exceeds 15 to 25 years, military versions even of hundreds of years. Overall, a paper showing that a weaker failure model results in simpler protocols." 
} @INCOLLECTION{Flatebo:1994:SSD, AUTHOR = "Mitchell Flatebo and Ajoy Kumar Datta and Sukumar Ghosh", TITLE = "Self-stabilization in distributed systems", BOOKTITLE = "Readings in Distributed Computing Systems", PUBLISHER = "IEEE Computer Society Press", YEAR = 1994, CHAPTER = 2, PAGES = "100--114", NOTE = "T. L. Casavant and M. Singhal, Editors" } @Article{Garg:1994:DWU, author = {V. K. Garg and Brian Waldecker}, title = {Detection of weak unstable predicates in distributed programs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1994}, OPTkey = {}, volume = {5}, number = {3}, OPTmonth = {}, pages = {299--307}, OPTnote = {}, annote = "Bibliographic details taken from \cite{Stoller:1997:DGP}." } @Article{Gouda:1994:SO, author = "Mohamed G. Gouda", title = "Stabilizing observers", journal = "Information Processing Letters", volume = "52", number = "2", pages = "99--103", day = "28", month = oct, year = "1994", keywords = "array of temperatures; boolean value; Convergence of numerical methods; Distributed computer systems; distributed processing; Error analysis; Fault tolerant computer systems; Observability; performance evaluation; Programmed control systems; sensors; stability; stabilizing observers; Stabilizing observers; Stabilizing phase synchronization; System stability; Uni-directional token systems", treatment = "P Practical; T Theoretical or Mathematical", } @TechReport{Hadzilacos:1994:MAF, title = "A Modular Approach to Fault-Tolerant Broadcasts and Related Problems", author = "Vassos Hadzilacos and Sam Toueg", number = "TR94-1425", year = "1994", month = may, institution = "Cornell University, Computer Science Department", pages = "83", annote = "Looks like an extended paper version of the chapter in Mullender's book on distributed systems \cite{Mullender:1993:DS}.
The contents: While theoretical research in fault tolerant distributed computing has focussed mainly on solving the consensus problem, applied research has investigated reliable broadcasts. The authors show that both problems are closely related. They give several precise semantics of fault models (Sect. 2.3, e.g., they model crash failure by introducing an additional non-leavable crash state and corresponding state transitions) and a good definition of synchrony, asynchrony and partial synchrony of models (Sect. 2.4). Timing failures are also discussed (Sect. 2.5). They develop a suite of broadcast specifications and algorithms separately and in an incremental way which is very instructive. Types of broadcasts are: reliable broadcast, timed reliable broadcast, uniform reliable broadcast (which places restrictions on the operation of faulty processes) and certain order specifications (FIFO, causal, atomic). Finally, the relation between consensus and atomic broadcast is investigated: they show that atomic broadcast can be transformed into a consensus algorithm, and that reliable broadcast and consensus yield atomic broadcast (all in the time-free model with crashes). The paper also discusses terminating variants of reliable broadcast (where processes deliver messages consistently even if they weren't sent, e.g., as in Byzantine Agreement \cite{Lamport:1982:BGP}) and multicast specifications. Contains a reference to a ``forthcoming book'' on fundamentals of fault tolerant distributed computing \cite{Hadzilacos:FFT} which obviously has not been published yet. Overall a very rewarding paper suited for introductory courses on this topic."
} @INPROCEEDINGS{Huang:1994:DEM, AUTHOR = "Shing-Tsaan Huang and Lih-Chyau Wuu and Ming-Shin Tsai", TITLE = "Distributed execution model for self-stabilizing systems", BOOKTITLE = "ICDCS94 Proceedings of the 14th International Conference on Distributed Computing Systems", YEAR = 1994, PAGES = "432--439", annote = "The authors introduce four categories of distributed system models (serial, synchronous, synchronized and distributed) and present a technique that makes verification of algorithms in the distributed model much easier once they have been proven correct for the serial model. [what's the idea behind this?]" } @Book{Isermann:1994:UEF, ALTauthor = {}, editor = {Rolf Isermann}, title = {{\"Uberwachung und Fehlerdiagnose --- Moderne Methoden und ihre Anwendungen bei technischen Systemen}}, publisher = {VDI-Verlag}, year = {1994}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {{D\"usseldorf}}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, annote = {[Angabe von Armin]} } @Book{Jalote:1994:FDS, author = "Pankaj Jalote", title = "Fault tolerance in distributed systems", publisher = pub-PH, year = 1994, OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", address = pub-PH:adr, OPTedition = "", OPTmonth = "", OPTnote = "", annote = "Fine self-contained overview over the area of fault tolerance in distributed systems. However, does not mention self-stabilization with a single word." 
}

@Article{Kindler:1994:SLP,
  author       = {Ekkart Kindler},
  title        = {Safety and Liveness Properties: {A} Survey},
  journal      = {EATCS-Bulletin},
  year         = {1994},
  number       = {53},
  month        = jun,
  annote       = {A brief (4 page) and very concise survey on the
                  differences and historical evolution of different
                  notions of safety and liveness.},
  url          = "http://www.informatik.hu-berlin.de/~kindler/PostScript/EATCS53.ps"
}

@Article{Lamport:1994:HTW,
  title        = "How to Write a Long Formula (Short Communication)",
  author       = "Leslie Lamport",
  journal      = "Formal Aspects of Computing",
  volume       = "6",
  number       = "5",
  pages        = "580--584",
  year         = "1994",
  url          = "http://www.research.digital.com/SRC/personal/Leslie_Lamport/proofs/src119.dvi.Z",
  annote       = "Lamport proposes a structured and hierarchical way to
                  write long mathematical formulas. Nested parentheses
                  are replaced by proper indentation, and formulas with
                  infix operators are used in a prefix operator style if
                  they are long. Also, the cases construct and the use of
                  definitions is discussed. The only unsurety is how to
                  write implications. This text previously appeared as
                  DEC SRC Research Report number 119.",
}

@InProceedings{Lamport:1994:SVF,
  author       = "Leslie Lamport and Stephan Merz",
  title        = "Specifying and Verifying Fault-Tolerant Systems",
  booktitle    = "Formal Techniques in Real-Time and Fault-Tolerant
                  Systems",
  year         = "1994",
  editor       = "Langmaack, H. and de Roever, W.-P. and Vytopil, J.",
  pages        = "41--76",
  OPTorganization = "Third International Symposium Organized Jointly
                  with the Working Group Provably Correct
                  Systems-ProCoS",
  volume       = "863",
  series       = ser-LNCS,
  publisher    = pub-SV,
  address      = "L{\"u}beck, Germany",
  month        = sep,
  annote       = "An in-length exposition of a formal proof of the oral
                  messages algorithm to the Byzantine Generals Problem
                  \cite{Lamport:1982:BGP}. The problem is specified on
                  three different levels of abstraction: (1) a general
                  and high level description of the process' behaviors,
                  given that they are loyal, (2) a mid-level description
                  containing the algorithm description, and (3) a
                  low-level description specifying how message exchange
                  works. Proofs are given that each lower level
                  specification implements the next higher level
                  specification including the correctness theorem at the
                  mid-level: if at most one traitor exists, then the high
                  level specification is implemented by the mid level
                  specification. It is interesting that the global fault
                  assumption appears at the mid-level, which conforms
                  with the fault-tolerant refinement idea of
                  \cite{Peled:1994:CFF}. The discussion contains some
                  concrete arguments as to why TLA and hierarchically
                  structured proofs can help engineers prove systems
                  correct up to an acceptable level of trust. By
                  introducing real-time, only safety properties need to
                  be proved, making aspects of the original Byzantine
                  failure model more explicit."
}

@Article{Lamport:1994:TLA,
  author       = "Leslie Lamport",
  title        = "{The Temporal Logic of Actions}",
  journal      = j-TOPLAS,
  volume       = "16",
  number       = "3",
  pages        = "872--923",
  month        = may,
  year         = "1994",
  annote       = "Main reference to the syntax, semantics and merits of
                  TLA. A good and increasingly exact overview starting
                  from small examples, introducing temporal operators,
                  fairness, composition, refinement, proof methods and
                  rules, reasons not to use types, hiding of variables
                  and some very interesting comments on mechanical
                  verification, TLA vs. conventional programming
                  languages, and comparisons with related formalisms.
                  For a shorter introduction read
                  \cite{Lamport:1994:ITT}."
}

@TechReport{Lamport:1994:ITT,
  author       = {Leslie Lamport},
  title        = {Introduction to {TLA}},
  institution  = {Digital Systems Research Center},
  year         = {1994},
  type         = {Technical Note},
  number       = {1994-001},
  address      = {Palo Alto, CA},
  month        = dec,
  annote       = {A short and instructive primer of TLA omitting all the
                  nitty gritty details. Starting point if you want to
                  specify programs in TLA fast. Standard reference is
                  \cite{Lamport:1994:TLA}.}
}

@InProceedings{Line:1994:MCS,
  author       = "Jeffery C. Line and Sukumar Ghosh",
  title        = "A methodology for constructing a stabilizing
                  crash-tolerant application",
  booktitle    = pro-srds94,
  year         = "1994",
  pages        = "12--21",
  annote       = "[to read]"
}

@InProceedings{Line:1994:SAD,
  author       = "Jeffery C. Line and Sukumar Ghosh",
  title        = "Stabilizing Algorithms for Diagnosing Crash Failures",
  pages        = "376",
  booktitle    = pro-podc94,
  month        = aug,
  year         = "1994",
  annote       = "A simple stabilizing ``I am alive'' protocol is
                  presented for diagnosing a single crash failure in at
                  least strongly connected networks. The protocol assumes
                  channels with finite capacities and bounded propagation
                  delays. See also \cite{Arora:1995:TBS}."
}

@InCollection{Liu:1994:SDF,
  author       = {Zhiming Liu and Mathai Joseph},
  title        = {Stepwise Development of Fault-Tolerant Reactive
                  Systems},
  booktitle    = {Formal Techniques in Real-Time and Fault-Tolerant
                  Systems},
  pages        = {529--546},
  publisher    = pub-SV,
  year         = {1994},
  OPTeditor    = {H. Langmaack and W.-P. de Roever and J. Vytopil},
  volume       = {863},
  series       = ser-LNCS,
  annote       = {[to read]}
}

@InProceedings{Lo:1994:UFD,
  title        = "Using Failure Detectors to Solve Consensus in
                  Asynchronous Shared-Memory Systems (Extended
                  Abstract)",
  author       = "Wai-Kau Lo and Vassos Hadzilacos",
  booktitle    = pro-wdag94,
  editor       = "Gerard Tel and Paul M. B. Vit{\'a}nyi",
  address      = "Terschelling, The Netherlands",
  month        = "29~" # sep # "--1~" # oct,
  year         = "1994",
  series       = "Lecture Notes in Computer Science",
  volume       = "857",
  publisher    = pub-SV,
  ISBN         = "3-540-58449-8",
  pages        = "280--295",
  annote       = "[to read]"
}

@Book{Lynch:1994:AT,
  author       = {Nancy A. Lynch and Michael Merritt and William Weihl
                  and Alan Fekete},
  title        = {Atomic Transactions},
  publisher    = {Morgan Kaufmann},
  address      = {San Mateo, CA},
  year         = {1994},
  annote       = {[to read] :-)}
}

@Article{Peled:1994:CFF,
  author       = {Doron Peled and Mathai Joseph},
  title        = {A Compositional Framework for Fault-tolerance by
                  Specification Transformation},
  journal      = {Theoretical Computer Science},
  volume       = {128},
  year         = {1994},
  pages        = {99--125},
  annote       = "A fault-tolerant program is viewed as a
                  fault-intolerant program enhanced by some
                  fault-tolerance/recovery mechanism (like in
                  \cite{Arora:1998:CDM,Arora:1998:DCT}). This can be
                  viewed as a program transformation, i.e. a function
                  $T$ that maps a fault-intolerant program $P$ to a
                  fault-tolerant version $P'=T(P)$. However, introducing
                  a recovery mechanism alters the original specification
                  $S$ of $P$ to some augmented specification $S'$ which
                  takes the behavior of the tolerance mechanism into
                  account. So the effects of a tolerance mechanism on
                  $S$ can be regarded as a specification transformation
                  $F$ which maps $S$ to $S'=F(S)$. A program
                  transformation $T$ and a specification transformation
                  $F$ correspond, if for all programs $P$ and for all
                  properties $p$, if $p$ holds for $P$ then $F(p)$ holds
                  for $T(P)$. Now, for some corresponding transformations
                  $T$ and $F$, if some property $p$ holds for $P$ and
                  $F(p)$ implies property $q$, then $q$ holds for
                  $T(P)$. So properties about $T(P)$ can be proved
                  without looking at the code of $T(P)$ if $T$ and $F$
                  correspond.
A specification transformation $F$ is said to be compositionally
                  complete with respect to a program transformation $T$
                  if all properties can be proved in this way. Criteria
                  for compositional completeness are given and depend on
                  the monotonicity and the expressiveness of the
                  specification language. --- The methodology is
                  exemplified by the example of forward and backward
                  recovery of distributed computations. A recovery
                  algorithm is proposed and its corresponding
                  specification transformation defined which is divided
                  in a fixed part (e.g., eventually a snapshot will be
                  taken (liveness) and the tolerance mechanism does not
                  interfere with the original computation (safety)) and
                  a part depending on the original specification. Then
                  the basic program is transformed into a fault-tolerant
                  program $P'$ and then to a fault-tolerant program in a
                  faulty environment (the method behind this is
                  described in \cite{Liu:1992:TPF}). Then some simple
                  properties of the fault-tolerant program are verified
                  by applying the transformation $F$ to them and using
                  the above proof rule. These properties are usually
                  weaker since faults may deem the original properties
                  unachievable (they however do not say how to derive
                  them in general). The authors give criteria how to
                  verify that $T$ and $F$ actually correspond. They also
                  discuss modularity issues: variables of the recovery
                  algorithm can be omitted from the specification by a
                  method of concealment. Fairness is an open problem
                  since imposing a fault-tolerance mechanism and invoking
                  it on faults can destroy fairness guarantees that held
                  for the untransformed program. The paper uses
                  interleaving semantics with a formalism coming from
                  the area of (temporal) logic and concurrency. A fine
                  paper."
}

@InProceedings{Ruget:1994:CMC,
  title        = "Cheaper Matrix Clocks",
  author       = "Fr{\'e}d{\'e}ric Ruget",
  booktitle    = pro-wdag94,
  editor       = "Gerard Tel and Paul M. B. Vit{\'a}nyi",
  address      = "Terschelling, The Netherlands",
  month        = "29~" # sep # "--1~" # oct,
  year         = "1994",
  series       = "Lecture Notes in Computer Science",
  volume       = "857",
  publisher    = pub-SV,
  ISBN         = "3-540-58449-8",
  pages        = "355--369",
  annote       = "[to read]"
}

@Article{Rushby:1994:CSP,
  author       = {John Rushby},
  title        = {Critical System Properties: Survey and Taxonomy},
  journal      = {Reliability Engineering and System Safety},
  year         = 1994,
  volume       = 43,
  number       = 2,
  pages        = {189--219},
  annote       = "Although quite long, this is a very insightful and
                  rewarding survey of various notions of ``critical
                  systems'' from the broad literature. First, Rushby
                  compares the four distinct approaches to critical
                  systems that have emerged: (1) dependability/fault
                  tolerance, (2) safety engineering, (3) secure systems,
                  (4) real time systems. The dependability approach
                  includes the usual notion of fault tolerance that a
                  system should not deviate from its system
                  specification if faults occur. The system
                  specification can also be degraded resulting in a
                  well-defined failure behavior (or failure semantics).
                  The central method to achieve this is application of
                  redundancy. Faults are categorized in fault models or
                  failure semantics of subcomponents and there is a
                  tradeoff between the fault types and the number of
                  faults that can occur for a given level of redundancy.
                  (``a quad-redundant Byzantine fault-tolerant system
                  can withstand a single fault of any kind, whereas a
                  differently organized quad-redundant system can
                  withstand as many as three crash faults, but no other
                  kind.'' [Hybrid models can help here.]) Managing
                  redundancy requires coordination, which is difficult.
                  Methods to fight transient faults (self-stabilization)
                  and design faults are also discussed. (2) The safety
                  engineering approach is concerned with the occurrence
                  of unplanned events. Safety means here that the system
                  does no harm of any kind.
Safety is achieved through hazard analysis (either reasoning backwards from a catastrophe or reasoning forward from a component failure). This can also be done for software, resulting in software fault tree analysis (SFTA). The advantage of this approach is that it explicitly considers the system context. A ``fail-safe'' operation is desired and achieved through a safe step-by-step operation based on a notion of locks (``lockin'', ``lockout''). While dependability ``tries to maximize the extent to which the system works well'', safety engineering ``tries to minimize the extent to which it can fail badly'' (p.13f). Thus dependability is natural in circumstances in which there is ``no safe alternative to normal service'' (like in aircraft control). (3) The secure systems approach holds up the protection of secrets and privacy. This includes a notion of integrity. Methods to achieve this are usually based on kernelization. This is analogous to fault containment in dependability. (4) The real-time systems approach needs to ensure deadlines and ``jitter'' (i.e. a certain quality of outputs). Real-time systems are organized as cyclic executives of a fixed number of processes in a fixed schedule (which has a number of disadvantages described on p. 20) or as preemptive, priority-driven schemes that dominate today (especially a method called rate monotonic scheduling where priorities are derived from iteration rates). Both methods are compared on p. 23. There are relations especially between hard-real-time and masking fault tolerance. In Chapter 3 Rushby surveys formal models for critical system properties and assurance methods. These include formalizations of security (via access control mechanisms), fault tolerance and real time. Formal notions of properties are usually based on traces (although security for example can be seen as a higher level property, see p. 29).
Fault tolerance formalizations are either calculational (like
                  \cite{Arora:1993:CCF}), i.e.~they calculate the effects
                  of faults and see whether resulting executions are
                  still ``safe'', or specificational, i.e.~the
                  fault-tolerance specification is composed of the
                  failure semantics of the subsystems (like
                  \cite{Herlihy:1991:SGD}). More references to the
                  literature are given on p.~30. Formalizations of
                  real-time properties are usually based on some form of
                  temporal logic and model checking (there are also
                  versions of such logics that take time intervals into
                  account, see p.~33f). Assurance techniques must take
                  random and systematic failures into account to
                  calculate some reliability measure (which for critical
                  systems is in the order of $10^{-9}$ probability of
                  failure during one hour operation). Direct measurement
                  and testing is ruled out because of these high demands
                  (testing would require some 100.000 years to meet
                  these measures). Calculational approaches on the other
                  hand contain many (``only'', p.~37) subjective factors
                  such as the examination of the lifecycle process.
                  Formal methods can be used to guarantee formal
                  correctness but nobody can give real evidence for
                  attaching some reliability number (this is a good
                  quote, p.~38). Finally, Rushby provides a taxonomy of
                  critical system properties based on interaction and
                  coupling which laxly said is the necessity of
                  flexibility versus the flexibility offered by the
                  system. Overall this is one of my top ten favourite
                  papers because it offers an understandable overview
                  with well-chosen and well-explained examples, written
                  in fine language and without the usual academic
                  high-nose. The pages refer to the printed version from
                  the web page
                  http://www.csl.sri.com/reports/html/csl-93-1.html"
}

@TechReport{Sabel:1994:SFS,
  title        = "Simulating Fail-Stop in Asynchronous Distributed
                  Systems",
  author       = "Laura S. Sabel and Keith Marzullo",
  number       = "TR94-1413",
  year         = "1994",
  month        = mar,
  institution  = "Cornell University, Computer Science Department",
  pages        = "24",
  abstract     = "The fail-stop failure model appears frequently in the
                  distributed systems literature. However, in an
                  asynchronous distributed system, the fail-stop model
                  cannot be implemented. In particular, it is impossible
                  to reliably detect crash failures in an asynchronous
                  system. In this paper, we show that it is possible to
                  specify and implement a failure model that is
                  indistinguishable from the fail-stop model from the
                  point of view of any process within an asynchronous
                  system. We give necessary conditions for a failure
                  model to be indistinguishable from the fail-stop model,
                  and derive lower bounds on the amount of process
                  replication needed to implement such a failure model.
                  We present a simple one-round protocol for implementing
                  one such failure model, which we call simulated
                  fail-stop.",
  annote       = "Published as \cite{Sabel:1994:SFA} and at PoDC94. Not
                  readily available on the net. See summary of
                  \cite{Sabel:1994:SFA}."
}

@InProceedings{Sabel:1994:SFA,
  author       = "Laura S. Sabel and Keith Marzullo",
  title        = "Simulating Fail-Stop in Asynchronous Distributed
                  Systems",
  pages        = "138--147",
  booktitle    = pro-srds94,
  month        = oct,
  publisher    = "IEEE Computer Society Press",
  address      = "Los Alamitos, Ca., USA",
  year         = "1994",
  annote       = "Abstract in \cite{Sabel:1994:SFS}. The authors present
                  a method how to ``implement'' the fail-stop failure
                  model in asynchronous environments. Because this task
                  is impossible, they give a version of a failure model
                  that is indistinguishable from fail-stop and call it
                  simulated fail-stop. The system model is based on the
                  asynchronous crash model with reliable FIFO channels.
                  Processes have a local `crash' variable and a `failed'
                  vector which should reflect the `crash' values of all
                  other processes.
They define the failed-before relation in terms of these variables: i failed before j in a run iff at j failed[i] is true and remains true in that run. The indistinguishability of runs is based on the definitions of \cite{Chandy:1986:HPL}. The fail-stop failure model is defined using two conditions: (FS1) A process's failure is eventually detected by all processes that don't crash. (FS2) There are no false detections. The authors derive three necessary conditions for indistinguishability of FS: (C1) If a process i detects the crash of a process j, then eventually j will crash. (C2) The failed-before relation is acyclic. (C3) A crash event happens before no other event. These are not sufficient conditions, as shown by a run that meets C1--C3 and is distinguishable from FS. However, the authors give another set of sufficient conditions which are not all necessary: weakening FS1 is not possible because this may prevent progress, so FS2 is weakened into four conditions: (FS2a) If a process i suspects the crash of process j, then eventually j will crash; in conjunction with FS1 this implies C1. (FS2b) The failed-before relation is acyclic; this is C2. (FS2c) A process never detects its own failure. (FS2d) Once i detects the failure of j, then all messages sent by i to any process k will not be received until k has also detected the failure of j; c and d together imply C3. The authors give a simple protocol that implements these conditions. The central idea is to form an agreement on the suspicions by using intersecting quorum sets of processes. This mainly ensures C2. The size of such a quorum set must be strictly greater than $n(\frac{t-1}{t})$, where n is the number of processes and t is the maximum number of processes that may fail.
The authors relate these results to the failure detector hierarchy of
                  \cite{Chandra:1996:UFD}: the fail-stop model is
                  equivalent to having a perfect failure detector (PFD),
                  and the properties that are proposed are those of a
                  strong failure detector (SFD). So while a PFD cannot be
                  implemented by a SFD, an indistinguishable failure
                  detector can be implemented. Here's a nice citation:
                  ``A failure model describes the manner in which the
                  components of a system can fail.'' (Sect. 3)"
}

@Article{Schepers:1994:TCP,
  author       = "Henk Schepers and Jozef Hooman",
  title        = "A trace-based compositional proof theory for fault
                  tolerant distributed systems",
  journal      = "Theoretical Computer Science",
  volume       = "128",
  number       = "1--2",
  pages        = "127--157",
  day          = "6",
  month        = jun,
  year         = "1994",
  corpsource   = "Dept. of Math. and Comput. Sci., Eindhoven Univ. of
                  Technol., Netherlands",
  keywords     = "alternating bit protocol; compositional formalism;
                  distributed processing; exceptional behaviour; failure
                  hypothesis; fault tolerant computing; fault tolerant
                  distributed systems; formal specification; formal
                  verification; input behaviour; network completeness;
                  output behaviour; reasoning; safety property
                  specification; software reliability; soundness;
                  theorem proving; trace-based compositional proof
                  theory; triple modular redundant system",
  annote       = "The authors introduce a rigorous formalism allowing to
                  prove safety properties of fault tolerant systems. This
                  is done by extending a formalism used to reason about
                  normal behavior (such as \cite{Hoare:1984:CSP}) with a
                  single rule by which a component specification is
                  weakened to reflect its faulty behavior. Prerequisite
                  is a precise characterization of faulty behavior, which
                  is done using a reflexive relation on normal and faulty
                  traces. The method is specificational
                  \cite{Rushby:1994:CSP} and at the system interface
                  level describing a specification transformation.
                  Examples (stuck at zero, message corruption, message
                  loss) are given.
Formally, a failure hypothesis is a reflexive relation on normal
                  behavior, preserving prefix closure and affecting only
                  the components of the failed process. A failure
                  hypothesis can be used to derive the faulty behavior of
                  a system. Examples which are proved safe are TMR and
                  the alternating bit protocol. The proof system is shown
                  to be sound and complete (didn't look at the proofs).
                  As said above, only safety properties are handled.
                  Compositional reasoning about liveness is difficult
                  \cite{Abadi:1993:CS}. Future work states that it would
                  be nice to have a logic to express failure hypotheses
                  more elegantly. \cite{Schepers:1993:CPT} extends this
                  work to also cover real time. Overall an interesting
                  paper, probably the Journal version of
                  \cite{Schepers:1993:TFT}, citing all the prominent
                  players of the time
                  \cite{Joseph:1987:PRF,Liu:1993:SVR,Nordahl:1993:DFD,Peleska:1991:DVF,Weber:1989:FSF}
                  and the Conference version of \cite{Peled:1994:CFF}."
}

@InProceedings{Schiper:1994:PPV,
  author       = "Andr{\'e} Schiper and Alain Sandoz",
  title        = "Primary Partition ``Virtually-synchronous
                  Communication'' Harder than Consensus",
  series       = ser-LNCS,
  volume       = "857",
  pages        = "39--52",
  year         = "1994",
  booktitle    = pro-wdag94,
  annote       = "The authors formally define the primary partition
                  virtually synchronous communication problem (PP-VSC)
                  and show that it is harder to solve than consensus in
                  the sense that PP-VSC is solvable whenever consensus is
                  solvable but there are situations where consensus is
                  solvable and PP-VSC is not. PP-VSC consists of 6
                  conditions that formalize the following intuition:
                  views are sets of processes. In PP-VSC every process
                  has the same view $V$ (as opposed to the partial VSC
                  problem). Assume that a new view $V'$ has to be defined
                  (because a process from $V$ is assumed to have crashed
                  for example). Then all processes in both $V$ and $V'$
                  must have delivered the same set of messages in view
                  $V$ before delivering the new view $V'$.
The system model used is the asynchronous model enhanced with failure
                  suspectors as defined by Chandra and Toueg
                  \cite{Chandra:1996:UFD}."
}

@Article{Schiper:1994:SSP,
  title        = "Strong Stable Properties in Distributed Systems",
  author       = "Andr{\'e} Schiper and Alain Sandoz",
  journal      = j-DC,
  pages        = "93--103",
  year         = "1994",
  volume       = "8",
  number       = "2",
  annote       = "[to read]"
}

@Book{Schuessler:1994:DS,
  author       = {H. W. Sch{\"u}{\ss}ler},
  title        = {{Digitale Signalverarbeitung}},
  publisher    = pub-SV,
  year         = {1994},
  address      = {Berlin},
  annote       = {[Angabe von Armin]}
}

@Article{Schwarz:1994:DCR,
  author       = "Reinhard Schwarz and Friedemann Mattern",
  title        = {Detecting causal relationships in distributed
                  computations: in search of the holy grail},
  journal      = j-DC,
  year         = 1994,
  volume       = 7,
  pages        = "149--174",
  annote       = "A well written and extensive survey about the
                  intrinsic problems in detecting causal relationships in
                  distributed systems. First, causality and vector time
                  are explained and how both relate to the notion of real
                  time. Then implementation aspects of vector time are
                  discussed. Next, the authors focus on the evaluation of
                  global predicates and show that the truth of such a
                  predicate depends on the observer. Different modalities
                  of predicates are surveyed (including the well known
                  `possibly' and `definitely') and a few algorithms for
                  predicate detection are presented. The bibliography
                  section contains 74 (!) references, so this paper can
                  be used as a starting point for own research. Overall,
                  the authors manage to show that dealing with
                  distributed systems is a complex and intriguing
                  undertaking."
}

@Book{Tel:1994:IDA,
  author       = {Gerard Tel},
  title        = {Introduction to Distributed Algorithms},
  publisher    = {Cambridge University Press},
  year         = 1994,
}

@Article{Walther:1994:OPT,
  author       = {Christoph Walther},
  title        = {On Proving the Termination of Algorithms by Machine},
  journal      = {Artificial Intelligence},
  year         = {1994},
  volume       = {71},
  pages        = {101--157},
  annote       = {Walther presents a method to prove the termination of
                  a class of normal sequential algorithms in a fully
                  automatic fashion. The algorithms are formulated in a
                  functional programming language and the idea behind
                  this method seems to be to derive a well-founded
                  ordering relation on recursive calls by some heuristics
                  based on size reduction. His method produces hypotheses
                  suitable for proving with an automatic theorem prover.
                  The method handles only algorithms that ``strongly''
                  terminate (a definition I have not understood) and here
                  not for all strongly terminating ones. However, strong
                  termination is a practical restriction since all
                  programs that do not have recursive calls in the
                  conditions of cases and do not have nested recursive
                  calls strongly terminate. The paper contains also an
                  overview over older work on (automatic) termination
                  proofs, such as a reference to Floyd's idea of
                  termination functions \cite{Floyd:1967:AMP}, the first
                  mentioning of the term ``convergence function''
                  \cite{Manna:1974:AAT} and comparison work of
                  termination proving methods \cite{Katz:1975:CLT}.}
}

@Book{Yang:1994:GST,
  editor       = {Zhonghua Yang and T. Anthony Marsland},
  title        = {Global States and Time in Distributed Systems},
  publisher    = {IEEE Computer Society Press},
  year         = {1994},
  annote       = {Contains all the classics on the subject, e.g.
                  \cite{Chandy:1985:DSD,Cooper:1991:CDG,Garg:1994:DWU,Mattern:1989:VTG}.}
}

@InCollection{??:1995:FLP,
  author       = "???",
  title        = "Summary of the discussion sessions: {FLP} and real
                  time",
  booktitle    = "Theory and Practice in Distributed Systems",
  publisher    = pub-SV,
  year         = "1995",
  editor       = "K. P. Birman and F. Mattern and A. Schiper",
  volume       = "938",
  series       = ser-LNCS,
  pages        = "260--261",
  annote       = "Summary of a discussion session from the Dagstuhl
                  workshop on theory and practice of distributed systems.
                  Discusses ways to circumvent the FLP result
                  \cite{Fischer:1985:IDC} and the various notions of real
                  time in distributed systems. This includes a mention of
                  failure detectors, timing assumption coverage,
                  real-time scheduling."
}

@Article{Abadi:1995:CS,
  author       = "Mart{\'\i}n Abadi and Leslie Lamport",
  title        = "Conjoining Specifications",
  journal      = j-TOPLAS,
  volume       = "17",
  number       = "3",
  pages        = "507--534",
  month        = may,
  year         = "1995",
  url          = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/201069.html",
  abstract     = "We show how to specify components of concurrent
                  systems. The specification of a system is the
                  conjunction of its components' specifications.
                  Properties of the system are proved by reasoning about
                  its components. We consider both the decomposition of a
                  given system into parts, and the composition of given
                  parts to form a system.",
  annote       = "The basis of this and other research
                  \cite{Abadi:1993:CS,Lamport:1989:SAS} is that programs
                  and their specifications are formulas in a temporal
                  logic (this idea is attributed to Pnueli
                  \cite{Pnueli:1981:TSC}). If specifications allow
                  stuttering steps, then $A\Rightarrow B$ asserts that
                  $A$ implements $B$. So checking the correctness of a
                  program can be done within the logic. Parallel
                  composition can then be seen as conjunction.
When dealing with composite systems there are two cases to consider:
                  (1) when starting with a composite specification $M$ we
                  want to decompose it into ``subcomponents'' $M_a$ and
                  $M_b$ where $M_a\land M_b \Rightarrow M$. Decomposition
                  usually results in slight modifications (due to
                  communication) of $M_a$ and $M_b$ resulting in
                  subcomponents $M_a^l$ and $M_b^l$. We want to prove
                  that $M_a^l\land M_b^l\Rightarrow M_a\land M_b$, but
                  unfortunately this involves reasoning about the full
                  low level protocol. Instead we could make use of the
                  fact that we have a decomposition and prove
                  $M_a^l\Rightarrow M_a$ and $M_b^l\Rightarrow M_b$ to
                  prove our result. But this is not always valid. The
                  Decomposition Theorem on page 527 states that we can
                  deduce $M_a^l\land M_b^l\Rightarrow M_a\land M_b$ from
                  three things: (a) $E_a\land M_a^l\Rightarrow M_a$, (b)
                  $E_b\land M_b^l\Rightarrow M_b$, and (c)
                  $M_a\land M_b\Rightarrow E_a\land E_b$. (2) The second
                  case to consider is when we start with a set of
                  subcomponents and want to reason about the
                  specification of the composed system. Given two
                  components as an assumption/guarantee specification
                  $E_a\Rightarrow M_a$ and $E_b\Rightarrow M_b$, then we
                  would like to deduce that the composed system satisfies
                  $M_a\land M_b$ if one is taken as the environment of
                  the other. This reasoning is however only valid if
                  $E_a$ and $E_b$ are safety properties. This fact is
                  discussed more elaborately in \cite{Abadi:1993:CS}. The
                  context in which this reasoning is done is TLA
                  \cite{Lamport:1994:TLA}."
}

@InProceedings{Alvarez:1995:ODA,
  author       = {Guillermo A. Alvarez and Flaviu Cristian and Shivakant
                  Mishra},
  title        = {On-Demand Asynchronous Atomic Broadcast},
  booktitle    = {Proceedings of the 5th IFIP Working Conference on
                  Dependable Computing and Critical Applications},
  year         = {1995},
  address      = {Urbana-Champaign, IL},
  month        = sep,
  url          = "ftp://ftp.cs.ucsd.edu/pub/grad/galvarez/papers/ondemand.ps.Z",
  annote       = {Focusses on practical performance issues. [to read]}
}

@InProceedings{Arora:1995:DMF,
  author       = "Anish Arora and Sandeep S. Kulkarni",
  title        = "Designing masking fault-tolerance via nonmasking
                  fault-tolerance",
  booktitle    = pro-srds95,
  year         = 1995,
  pages        = "174--185",
  annote       = "Appeared later in the IEEE Transactions on Software
                  Engineering \cite{Arora:1998:DMF}."
}

@InProceedings{Arora:1995:ECC,
  author       = "Anish Arora and Mohamed Gouda",
  title        = "Load balancing: an exercise in constrained
                  convergence",
  pages        = "183--197",
  booktitle    = pro-wdag95,
  year         = "1995",
  url          = "ftp://ftp.cis.ohio-state.edu/pub/anish/papers/load-balancing.ps.gz",
  annote       = "Stepwise design of distributed load balancing
                  algorithms from specifications using the paradigm of
                  constrained convergence."
}

@InProceedings{Arora:1995:TBS,
  author       = "Anish Arora and David M. Poduska",
  title        = "A timing-based schema for stabilizing information
                  exchange",
  booktitle    = "Proceedings of the Third International Conference on
                  Computer Networks, Tokyo, Japan",
  year         = "1995",
  annote       = "The authors construct a sequence of increasingly
                  complex stabilizing information exchange protocols:
                  first a solution for the adjacency problem is given,
                  which is extended to a connectivity protocol and
                  finally to a general information exchange protocol that
                  may be used to detect and establish certain predicates
                  in the system. This schema can be used to implement
                  commitment, leader election, spanning tree construction
                  (i.e., all locally checkable specifications). It is
                  closely related to the paradigm of information
                  propagation. Interesting is the discussion of real time
                  properties: the guarded command notation is extended to
                  specify real time bounds on actions and methods for
                  specifying and proving timeliness properties of
                  algorithms are discussed. The underlying system model
                  uses synchronized clocks and channels with bounded
                  message delay."
}

% Accents are braced one letter at a time so that BibTeX treats each as a
% single "special character" (correct initials and sort key).
@Article{Babaoglu:1995:SVD,
  author   = "{\"O}zalp Babao{\u{g}}lu and Michel Raynal",
  title    = "Specification and Verification of Dynamic Properties in Distributed Computations",
  journal  = "Journal of Parallel and Distributed Computing",
  volume   = "28",
  number   = "2",
  pages    = "173--185",
  month    = aug,
  year     = "1995",
  keywords = "Boolean algebra; Boolean predicates; causality-preserving order; classes; debugging; distributed algorithms; distributed applications; distributed computations; distributed systems; dynamic property specification; dynamic property verification; dynamic reconfiguration; formal; global predicate; global system states; interval-constrained sequences; program; program debugging; program testing; simple sequences; specification; verification",
  annote   = "The authors investigate the specification and detection of a new class of dynamic properties: these are simple sequences (causality preserving sequences of global states) and interval-constrained sequences (simple sequences with undesired states in the middle). They give algorithms that efficiently detect these predicates based on the usual construction algorithms of the lattice of global states \cite{Cooper:1991:CDG}. The paper contains a good analysis of the inherent costs of constructing the lattice and detecting the predicates and relates their (and others') methods to temporal logics. The discussion section argues that increased expressive power of the observable predicates will always result in an increased cost of detecting it, however, that the worst case analysis is not very realistic since the communication patterns of for example programs using RPC result in very lean lattices."
}

@Article{Birman:1995:RTC,
  author   = {Kenneth P. Birman and Bradford B. 
Glade},
  title    = {Reliability through consistency},
  journal  = j-IEEE-SOFTWARE,
  year     = {1995},
  month    = may,
  pages    = {29--41},
  annote   = "This paper argues that consistency is a key to fault tolerant applications. In particular, consistent failure reporting is important. Different levels of consistency are defined (stabilization consistency [i.e. the system stabilizes to a consistent state], piecewise consistency [i.e. causal consistency], and uniform consistency [i.e. atomic consistency]). Current systems (such as Unix, Chorus, Windows NT, DCE and CORBA, Mach, ISIS and others) are assessed for their consistency guarantees. Implementation difficulties are discussed."
}

@Book{Bishop:1995:NNP,
  author    = {Christopher M. Bishop},
  title     = {Neural Networks for Pattern Recognition},
  publisher = {Clarendon Press},
  address   = {Oxford},
  year      = {1995},
  annote    = {[Angabe von Armin]}
}

@TechReport{Chandra:1995:UFD,
  author      = "Tushar Chandra and Sam Toueg",
  title       = "Unreliable Failure Detectors for Reliable Distributed Systems",
  institution = "Cornell University, Computer Science Department",
  number      = "TR95-1535",
  month       = aug,
  year        = "1995",
  pages       = "51",
  abstract    = "We introduce the concept of unreliable failure detectors and study how they can be used to solve Consensus in asynchronous systems with crash failures. We characterise unreliable failure detectors in terms of two properties --- completeness and accuracy. We show that Consensus can be solved even with unreliable failure detectors that make an infinite number of mistakes, and determine which ones can be used to solve Consensus despite any number of crashes, and which ones require a majority of correct processes. 
We prove that Consensus and Atomic Broadcast are reducible to each other in asynchronous systems with crash failures; thus the above results also apply to Atomic Broadcast. A companion paper shows that one of the failure detectors introduced here is the weakest failure detector for solving Consensus [CHT92].",
}

@Article{Charron-Bost:1995:LTP,
  author  = "Bernadette Charron-Bost and Carole Delporte-Gallet and Hugues Fauconnier",
  title   = "Local and temporal predicates in distributed systems",
  journal = j-TOPLAS,
  volume  = "17",
  number  = "1",
  pages   = "157--179",
  month   = jan,
  year    = "1995",
  annote  = "This is an intrinsic paper combining general knowledge theory and predicate detection in distributed systems. The authors re-visit Cooper and Marzullo's \cite{Cooper:1991:CDG} predicate transformers `possibly' and `definitely', investigate their properties and show how they relate to the predicate transformer `process p knows phi'. They also define the important notion of a predicate being local to some process set (i.e. the truth value depends only on the local states of that set) and show that knowledge is local (i.e. local predicates are knowledge predicates and vice versa). The results show an interesting analogy between knowledge predicates (which are local and thus ``spatial'') and the temporal predicates `possibly' and `definitely'. Also, a special type of predicates (called `observer independent') is investigated which are easily detectable: observer independent predicates are such for which possibly and definitely coincide. They show that a predicate which is local to one process is observer independent, as well as the disjunction of observer independent predicates. Interestingly, these results show that ``a process never forgets''. 
Overall, this is a very formal, but nevertheless rewarding article offering some surprising insights, but a little lengthy missing a few ``real-world'' examples (see the article by Halpern and Moses \cite{Halpern:1990:KCK} for one with lots of examples)."
}

% The isbn value holds the bare number; the "ISBN" label is left to the style.
@InProceedings{Chase:1995:EDR,
  author    = "Craig M. Chase and Vijay K. Garg",
  title     = "Efficient Detection of Restricted Classes of Global Predicates",
  booktitle = pro-wdag95,
  editor    = "Jean-Michel H{\'e}lary and Michel Raynal",
  series    = ser-LNCS,
  volume    = "972",
  publisher = pub-SV,
  address   = "Le Mont-Saint-Michel, France",
  month     = sep,
  year      = "1995",
  isbn      = "3-540-60274-7",
  pages     = "303--317",
  annote    = "[to read]"
}

@Article{Cristian:1995:ABF,
  author  = "Flaviu Cristian and Houtan Aghili and Ray Strong and Danny Dolev",
  title   = "Atomic Broadcast: From Simple Message Diffusion to {Byzantine} Agreement",
  journal = "Information and Computation",
  volume  = "118",
  number  = "1",
  pages   = "158--179",
  month   = apr,
  year    = "1995",
  annote  = "The authors present three timed atomic broadcast algorithms with increasing fault tolerance properties: (1) timestamped message diffusion based on flooding, tolerant against a limited number of crash/omission failures; (2) timestamped message diffusion with hop count, tolerant against timing failures; (3) timestamped message diffusion with hop count and authentication, tolerant against authentication detectable Byzantine failures. All protocols provide timely dissemination up to network partition. Two lower bounds prove that (1) the time needed for atomic broadcast to terminate in a network of diameter $x$ is $O(x)$ (limited number of crash/omission failures, network stays connected); (2) any atomic broadcast protocol with $n$ processors that tolerates $n-2$ authentication detectable Byzantine processor failures cannot have a termination time smaller than $(n-1)\cdot\delta$. 
Conclusions contain references to other work on atomic broadcast and shows the alternative between diffusion based and acknowledgement based protocols. The authors also argue that bounded reaction time is incompatible with partitions. The derivational presentation of the algorithms reminds of \cite{Hadzilacos:1994:MAF} and is very rewarding."
}

% The journal macro is written exactly as the file defines it (j-ACM).
@Article{Dolev:1995:DFC,
  author  = {Danny Dolev and Joseph Y. Halpern and Barbara Simons and Ray Strong},
  title   = {Dynamic Fault-Tolerant Clock Synchronization},
  journal = j-ACM,
  volume  = {42},
  number  = {1},
  pages   = {143--185},
  month   = jan,
  year    = {1995},
  annote  = {Proposes a new algorithm for clock synchronization. First gives a good overview over other algorithms: mostly they are averaging methods requiring $3f+1$ nodes or $2f+1$ if authentication is available. There are also phase locking algorithms, where nodes periodically broadcast their time and others set their clock to that time. Assumptions are bounded drift rate between local hardware clocks, and an upper bound on message transmission time. A tolerance specification of linear envelope synchronization is given on p. 150. The algorithm they give is later extended to also handle processor joins, it can tolerate any number of faults provided the correct processes stay connected. Overall a very rigorous paper, gives a good impression of clock synchronization up to today. }
}

@InProceedings{Dolev:1995:SCS,
  author    = "Shlomi Dolev and Jennifer L. Welch",
  title     = "Self-stabilizing clock synchronization in the presence of {Byzantine} faults",
  booktitle = "Proceedings of the Second Workshop on Self-Stabilizing Systems",
  pages     = "9.1--9.12",
  year      = "1995",
  annote    = "It is known that clock synchronization in Byzantine environments requires $3f+1$ processors if $f$ is the number of faulty processors. Protocols exist for these cases. 
In this paper the authors investigate the problem under a more severe failure assumption: apart from $f$ processors being faulty, any form of transient faults may happen to the system. They present two probabilistic protocols that synchronize clocks in a system under these assumptions. In effect, these protocols are self-stabilizing. The protocols cause the local clocks to converge into a given margin within time exponential to the total number of processes. Because they investigate arbitrary transient faults, they also use bounded clocks that wrap around periodically. They also present an interesting application of the Chinese Remainder Theorem for implementing a distributed counter."
}

@InCollection{Echtle:1995:TFT,
  author    = "Klaus Echtle and Martin Leu",
  title     = "Test of fault tolerant distributed systems by fault injection",
  booktitle = "Fault-Tolerant Parallel and Distributed Systems",
  editor    = "D. Pradhan and D. Avresky",
  publisher = pub-IEEE,
  pages     = "244--251",
  year      = "1995",
  annote    = "This paper presents a method to efficiently produce test cases for fault injection to test fault tolerant algorithms for design faults. Usually, the number of test cases is very large, because every branch and all paths of a program must be explored. Here, the number of test cases is reduced by two methods: (1) instead of analyzing the full program, an abstraction of it is considered. The abstraction is modeled by a timed Petri net and omits the description of nodes assumed to be faulty. (2) Test cases are generated from this Petri net by constructing the reachability graph and semiautomatically cutting off paths that are semantically unjustified (because for example timing assumptions violate the given failure model). 
The resulting test cases are in a sense ``complete'' and significantly less than brute force approaches yield. It is interesting how the behavior of faulty nodes is modeled on the abstraction level: nothing can be assumed about their behavior, resembling Byzantine behavior. The test cases can subsequently be used to test the implemented system and spare the developer from tedious full verification of the algorithm with all its low-level details. The work in this paper is related to ground-breaking work of Echtle in 1984 \cite{Echtle:1984:FSV}."
}

@InProceedings{Fetzer:1995:PCA,
  author    = "Christof Fetzer and Flaviu Cristian",
  title     = "On the possibility of consensus in asynchronous systems",
  booktitle = "Proceedings of the 1995 Pacific Rim International Symposium on Fault-Tolerant Systems",
  month     = dec,
  year      = "1995",
  annote    = "The authors show that consensus is possible in the timed asynchronous system model together with the ``always eventually majority-stable progress assumption''. They argue that this adequately reflects today's networked workstations, i.e., that today's networks are not completely asynchronous. The work is related to other work that adds synchrony to the time free model, claims to be closest to \cite{Dwork:1988:CPP} (the ``global stabilization model'') and does not relate in depth to \cite{Chandra:1992:WFD} or \cite{Chandra:1991:UFD} because ``the model considered there is time-free, [it] assumes that properties of failure detectors eventually always hold, and [it] does not include processor restarts''."
}

% Accented letters are braced individually (Andr{\'e}) so that styles can
% still abbreviate the given name.
@InProceedings{Guerraoui:1995:NBA,
  author    = "Rachid Guerraoui and Mikel Larrea and Andr{\'e} Schiper",
  title     = "Non blocking atomic commitment with an unreliable failure detector",
  booktitle = pro-srds95,
  month     = sep,
  year      = "1995",
  annote    = "The authors present a solution to the non-blocking atomic commitment problem in asynchronous systems using failure detectors. A commit protocol is a consensus protocol with favour of abort, and it is non-blocking meaning that only all surviving members need to commit. The authors adapt Chandra and Toueg's consensus algorithm \cite{Chandra:1996:UFD} to solve atomic commitment. Necessary prerequisites for termination are therefore eventually weak failure detectors and a majority of correct processes."
}

@InProceedings{Guerraoui:1995:RRB,
  author    = "Rachid Guerraoui",
  title     = "Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus",
  booktitle = pro-wdag95,
  editor    = "Jean-Michel H{\'e}lary and Michel Raynal",
  series    = "Lecture Notes in Computer Science",
  volume    = "972",
  publisher = pub-SV,
  address   = "Le Mont-Saint-Michel, France",
  month     = "13--15~" # sep,
  year      = "1995",
  isbn      = "3-540-60274-7",
  pages     = "87--100",
  annote    = "[to read]"
}

@InCollection{Guerraoui:1995:TMV,
  author    = "Rachid Guerraoui and Andr{\'e} Schiper",
  title     = "Transaction model vs virtual synchrony model: bridging the gap",
  booktitle = "Theory and Practice in Distributed Systems",
  editor    = "K. P. Birman and F. Mattern and A. Schiper",
  series    = ser-LNCS,
  number    = "938",
  publisher = pub-SV,
  pages     = "121--131",
  year      = "1995",
  annote    = "[to read]"
}

@InProceedings{Gouda:1995:TTS,
  author    = "Mohamed G. 
Gouda",
  title     = "The triumph and tribulation of system stabilization",
  booktitle = pro-wdag95,
  pages     = "1--18",
  year      = 1995,
  annote    = "reviews 10 years of stabilization research."
}

% Handbook chapter: it has booktitle, editor and publisher but no journal,
% so the entry type is InCollection rather than Article.
@InCollection{Halpern:1995:RAK,
  author    = "Joseph Y. Halpern",
  title     = "Reasoning about Knowledge: {A} Survey",
  booktitle = "Handbook of Logic in Artificial Intelligence and Logic Programming, Volume 4: Epistemic and Temporal Reasoning",
  editor    = "D. M. Gabbay and C. J. Hogger and J. A. Robinson",
  publisher = "Oxford University Press",
  pages     = "1--34",
  year      = "1995",
  annote    = "[to read]"
}

@InProceedings{Isermann:1995:OFL,
  author    = {Rolf Isermann},
  title     = {On Fuzzy Logic Applications for Automatic Control, Supervision and Fault Diagnosis},
  booktitle = {Proceedings of the Third European Congress on Intelligent Techniques and Soft Computing (EU-FIT)},
  address   = {Aachen},
  pages     = {738--753},
  year      = {1995},
  annote    = {[Angabe von Armin]}
}

@InProceedings{Jegou:1995:LSA,
  author    = {Roland J{\'e}gou and Raoul Medina and Lhouari Nourine},
  title     = {Linear space algorithm for on-line detection of global predicates},
  booktitle = {Proceedings of the International Workshop on Structures in Concurrency Theory (STRICT)},
  editor    = {J{\"o}rg Desel},
  series    = {Workshops in Computing},
  publisher = pub-SV,
  pages     = {175--189},
  year      = {1995}
}

@InProceedings{Julier:1995:NAF,
  author    = {Simon J. Julier and Jeffrey K. Uhlmann and Hugh F. 
Durrant-Whyte},
  title     = {A new approach for filtering nonlinear systems},
  booktitle = {Proceedings of the 1995 American Control Conference},
  address   = {Seattle, WA},
  pages     = {1628--1632},
  year      = {1995},
  annote    = {Presents a method to replace the extended Kalman filter \cite{Welch:1995:IKF} by some tricky prediction mechanism that doesn't require to calculate Jacobian matrices.}
}

% The url field holds the bare address, like the other url fields in this
% file; any \url{} wrapping is left to the bibliography style.
@Misc{Ladkin:1995:340,
  author       = {Peter Ladkin},
  title        = {Re: A340 incident at {Heathrow} {(Hatton, RISKS-16.92)}},
  howpublished = {The Risks Digest (Forum on Risks to the Public in Computers and Related Systems)},
  volume       = 16,
  number       = 96,
  month        = mar,
  year         = {1995},
  url          = "http://catless.ncl.ac.uk/Risks",
  annote       = {Describes in detail the Airbus A340 incident at Heathrow in September 1994. During the approach, both display screens in the cockpit went blank and displayed a message ``please wait''. The pilots were still able to fly the plane, but it's somewhat difficult without instrument feedback. The autopilot, which was subsequently switched on, tuned into a ``false glideslope'', a side effect of the radio beam used for landing the aircraft under instrument conditions. This caused the aircraft to fly unusually high pitch rates. The pilots subsequently turned off the autopilot and used a SRA (surveillance radar approach) where the plane is ``talked'' down by the tower. They landed safely. Later the logs of the computer system showed that there had been near-to simultaneous faults in the two redundant flight control systems leading to unexpected behavior (for example, the system also wrongly complained that it was low on fuel). 
Airbus Industries is said to be aware that there are problems within the redundancy management and that the failure of one computer can cause a failure in the next.}
}

@Article{Lamport:1995:HWP,
  author  = "Leslie Lamport",
  title   = "How to write a proof",
  journal = "American Mathematical Monthly",
  volume  = "102",
  number  = "7",
  pages   = "600--608",
  month   = aug # "\slash " # sep,
  year    = "1995",
  url     = "http://www.research.digital.com/SRC/personal/Leslie_Lamport/proofs/proofs.html",
  annote  = "A way of writing proofs is presented that ``makes it much harder to prove things that are not true''. It is a structured proof writing method similar to proof trees of interactive theorem proving environments. The exposition and experience reports with this method are delightful. Prior version appeared as DEC SRC Research Report number 94"
}

% The non-empty OPT fields below are kept as they were: disabled, not deleted.
@InProceedings{LeLann:1995:ORN,
  author       = "Gerard {Le Lann}",
  title        = "On real-time and non real-time distributed computing",
  booktitle    = pro-wdag95,
  OPTnumber    = "972",
  OPTseries    = ser-LNCS,
  OPTpublisher = pub-SV,
  pages        = "51--70",
  month        = sep,
  year         = "1995",
  OPTnote      = "Invited paper.",
  annote       = "The author explores the relationship between problems in real-time and non real-time computing. The distinction between both classes is that real-time problems have a set of timeliness constraints and their model has additional restrictions on event releases. Timeliness constraints are considered to be a composition of safety and liveness properties. Two examples are discussed: the asynchronous consensus problem (for non real-time) and the hard real-time distributed multiaccess channel problem. Both presentations, especially that of the second example, are intricate and tedious to understand. 
Finally, the author identifies that timeliness constraints are related to on-line scheduling strategies in the sense that solutions in an asynchronous model may be ``immersed'' into real-time environments by adding special scheduling algorithms. This corresponds to a distinction between design and implementation phases. The paper contains a reference to the distinct phases of diffusion and decision in asynchronous consensus and relates them to knowledge theoretic terms such as partial common knowledge. It also discusses real-time equivalents of an eventually weak failure detector. Overall, a paper with lots of ideas, many typos and typographical shortcomings (obviously hastily produced) and lots of passages which I do not grasp."
}

@InCollection{Liu:1995:FFF,
  author    = {Zhiming Liu and Mathai Joseph},
  title     = {A formal framework for fault-tolerant programs},
  booktitle = {Mathematics of Dependable Computing},
  editor    = {C. M. Mitchell and V. Stavridou},
  publisher = {Oxford University Press},
  pages     = {131--148},
  year      = {1995},
  annote    = {This is a book chapter incorporating ideas and results from a few other papers by the same authors. The beginning is the same as in \cite{Liu:1996:VFR}: program development is described as a sequence of refinement steps; action systems, TLA, specifications and the rest of the formalism is introduced. In contrast to \cite{Liu:1996:VFR} here also liveness properties are studied. Liveness properties result from imposing some fairness condition on the specification. Refinement mappings are defined along the lines of \cite{Lamport:1989:SAS}. Then faults and their effects are studied (as in \cite{Liu:1996:VFR}): physical faults are modeled as a set of actions which are scheduled concurrently with regular program actions, i.e. faults are isolated/separated from the program. 
Then the fault-tolerant refinement relation is discussed (like in \cite{Liu:1996:VFR}) and the distinction between global and local fault assumption (terms are attributed to Nordahl's thesis \cite{Nordahl:1992:SDD}). Global fault assumptions are always safety properties while local fault assumptions are safety and liveness properties (specified by state transitions). It is shown (as in \cite{Liu:1996:VFR}) that the global fault assumptions may be integrated into the fault actions: the specification of the fault-affected program is the conjunction of an (1) initial property, (2) the state transitions of the program and the faults, (3) the fault assumption and (4) the fairness property. The safety properties (2) and (3) can be encoded in a new state transition relation and thus are ``locally programmable''. Yes, separating local from global fault assumptions makes it easier to specify fault affected behaviors. But before proving the fault tolerance, the global assumption should be integrated into the transition system. Overall this is a version of \cite{Liu:1996:VFR} using the same examples but discussing liveness issues and not touching real-time. }
}

@Book{Manna:1995:TVR,
  author    = {Zohar Manna and Amir Pnueli},
  title     = {Temporal verification of reactive systems: safety},
  publisher = pub-SV,
  year      = {1995},
  annote    = {See also \cite{Manna:1991:TLR}. Where's liveness?}
}

% The umlaut is braced as a single letter (F{\"u}nfrocken) so BibTeX sorts
% and labels the surname correctly.
@InCollection{Mattern:1995:NLI,
  author    = "Friedemann Mattern and Stefan F{\"u}nfrocken",
  title     = "A non-blocking lightweight implementation of causal order message delivery",
  booktitle = "Theory and Practice in Distributed Systems",
  publisher = pub-SV,
  year      = "1995",
  editor    = "K. P. Birman and F. Mattern and A. 
Schiper",
  series    = ser-LNCS,
  number    = "938",
  pages     = "197--213",
  annote    = "An implementation of causal order delivery using low level FIFO buffers. Excludes some computations but is very efficient."
}

@Book{Neumann:1995:CRR,
  author    = {Peter G. Neumann},
  title     = {Computer Related Risks},
  publisher = {ACM Press},
  year      = {1995},
  annote    = {A great collection of computer related incidents from many areas (defense, space, aviation, etc.) affecting reliability, safety and security together with cause analysis and a discussion about technical and social countermeasures. A good source of information and also a starting point for more because of the good references (especially the RISKS forum).}
}

@TechReport{Sabel:1995:EVC,
  author      = "Laura S. Sabel and Keith Marzullo",
  title       = "Election Vs. Consensus in Asynchronous Systems",
  institution = "Cornell University, Computer Science Department",
  number      = "TR95-1488",
  month       = feb,
  year        = "1995",
  pages       = "9",
  abstract    = "It was shown in 1985 that the {\em Consensus problem} cannot be solved in an asynchronous system if even a single crash failure can occur. In this paper, we show that there are other problems that cannot be solved in an asynchronous system, and for the same intuitive reason: it is impossible to distinguish a very slow processor from a crashed processor. However, these problems are harder than Consensus, in that there are contexts in which Consensus can be solved but these other problems cannot. More precisely, the weakest failure detector that is needed to solve these problems is a Perfect Failure Detector, which is strictly stronger than the weakest failure detector that is needed to solve Consensus. 
We use a formulation of the Election problem as the prototype for these problems that are harder than Consensus.",
  annote      = "Contains a good and concise definition of failure detectors \`a la Chandra and Toueg \cite{Chandra:1996:UFD} in terms of temporal logic. The proof idea is as follows: a failure detector has very weak completeness iff eventually every process that crashes is suspected at least once by some correct process. The authors then show that (1) strong accuracy and very weak completeness are necessary to solve election, and (2) that both together are sufficient. This shows that a strongly complete and very weakly accurate failure detector is the weakest failure detector necessary for election. Very weak completeness and strong accuracy however suffice to implement a perfect failure detector. Thus the weakest failure detector for election is stronger than the weakest failure detector for consensus. Thus, election is harder than consensus. Other problems as hard as election are primary backup and (probably) terminating reliable broadcast. "
}

@Article{Singhal:1995:OPA,
  author    = "Mukesh Singhal and Friedemann Mattern",
  title     = "An optimality proof for asynchronous recovery algorithms in distributed systems",
  journal   = j-IPL,
  volume    = "55",
  number    = "3",
  pages     = "117--121",
  day       = "11",
  month     = aug,
  year      = "1995",
  keywords  = "Algorithms; asynchronous recovery; Asynchronous recovery algorithms; Computation theory; Computer networks; Computer simulation; Computer system recovery; Consistent cut; consistent cut; Data communication systems; Data processing; Distributed computer systems; distributed processing; distributed systems; Internal events; Message receive events; Message send events; Optimality proof; optimality proof; roll backs; system recovery",
  treatment = "T Theoretical or Mathematical",
  annote    = "[to read]"
}

@InProceedings{Stoller:1995:FPD,
  title     = "Faster Possibility Detection by Combining Two Approaches",
  author    = "Scott D. Stoller and Fred B. 
Schneider",
  booktitle    = pro-wdag95,
  OPTeditor    = "Jean-Michel H{\'e}lary and Michel Raynal",
  OPTseries    = ser-LNCS,
  OPTvolume    = "972",
  OPTpublisher = pub-SV,
  OPTISBN      = "ISBN 3-540-60274-7",
  month        = sep,
  year         = "1995",
  pages        = "318--332",
  annote       = "The main contribution of this paper is the best in-depth investigation of the complexity of possibility detection so far. The general algorithms by Cooper and Marzullo \cite{Cooper:1991:CDG} have worst case time complexity of $\Omega(S^N)$ where $N$ is the number of processes and $S$ is the maximum number of relevant events on every process. This is because \emph{every} consistent global state has to be investigated. However, as shown for example by Garg and Waldecker \cite{Garg:1994:DWU}, one can do better for restricted types of predicates. In this paper, the authors show an interesting decomposition property of the set of global consistent states and an application to possibility detection: a state $g$ is globally consistent iff for any subset $F$ of processes (1) the restriction of $g$ to $F$ is concurrent to the restriction of $g$ to the complement of $F$, and (2) the restriction of $g$ to $F$ is a consistent global state in the computation restricted to $F$, and (3) the restriction of $g$ to the complement of $F$ is a consistent global state in the computation restricted to the complement of $F$. The idea now is to reformulate the detection predicate and to specialize it with respect to some subset $F$ of processes. Then, possibly(P) is equivalent to choosing a set $F$ of processes, choosing a consistent global state $g$ of the computation restricted to $F$, and checking whether possibly(P') holds in the computation restricted to the complement of $F$, where P' denotes the predicate P specialized to $g$. (Uff!) 
Now, having such a fixed set $F$, a standard algorithm for possibility detection can be run in ``smaller'' computations, but this has to be done as many times as the computation restricted to $F$ has consistent global states. So, the complexity of the resulting algorithm depends on $|F|$ and is $O(S^{|F|+1})$ which is better than usual whenever $|F|<N-1$. However, finding a minimal fixed set is shown to be NP-complete and so only approximations help in general (there is some work to be done here). The authors additionally show that among all formulas equivalent to $P$, the disjunctive normal form (DNF) has minimum cost for possibility detection (every disjunct can be detected separately). A few enhancements are discussed, example applications are given and some funny matrix multiplication method is presented for off-line possibility detection. Finally, a well-written section discusses the inherent complexity of detecting possibly and gives some good references. Overall, this is a paper that at some points supersedes my own abstraction bounds and swims in a theoretical ocean which is very wide."
}

@Article{Verissimo:1995:QSS,
  author  = {Paulo Ver{\'\i}ssimo and Carlos Almeida},
  title   = {Quasi-synchronism: a step away from the traditional fault-tolerant real-time system models},
  journal = {Bulletin of the Technical Committee on Operating Systems and Application Environments (TCOS)},
  volume  = {7},
  number  = {4},
  pages   = {35--39},
  year    = {1995},
  annote  = {The ideas herein appear in a more general and elaborate form in \cite{Almeida:1998:QSA}.}
}

@TechReport{Welch:1995:IKF,
  author      = {Greg Welch and Gary Bishop},
  title       = {An Introduction to the {Kalman} filter},
  institution = {University of North Carolina at Chapel Hill, Department of Computer Science},
  number      = {TR 95-041},
  address     = {Chapel Hill, NC 27599-3175},
  year        = {1995},
  annote      = {This paper provides an introduction to the concept of a Kalman filter for the non-expert. A Kalman filter can be used to estimate the state of a discrete linear process in noisy environments. If the process is non-linear, an extended Kalman filter is used that assumes linearity on intervals of process behavior. The extended Kalman filter requires calculating the Jacobian matrix of derivatives of the process modeling function. 
A new approach to filtering nonlinear systems that does not require calculating these matrices is described in \cite{Julier:1995:NAF}.}
}

@Article{Wiederhold:1995:MIS,
  author          = "Gio Wiederhold",
  title           = "Mediation in Information Systems",
  journal         = j-ACM-COMP-SURVEYS,
  volume          = "27",
  number          = "2",
  pages           = "265--267",
  month           = jun,
  year            = "1995",
  url             = "http://www.acm.org/pubs/toc/Abstracts/0360-0300/210390.html",
  annote          = "discusses sensor/actuator approach [to read]"
}

% NOTE: same paper as Zhou:1996:FNP; the key says 1995 although the year
% field is 1996.  The key is kept so that existing citations still resolve.
@InProceedings{Zhou:1995:FNP,
  author          = "Jianying Zhou and Dieter Gollmann",
  title           = "A Fair Non-repudiation Protocol",
  keywords        = "non-repudiation, trusted third party",
  pages           = "55--61",
  year            = "1996",
  booktitle       = "Proceedings of the IEEE Symposium on Security and Privacy",
  address         = "Oakland, CA",
  publisher       = pub-IEEE,
  month           = may,
  organization    = "IEEE Computer Society, Technical Committee on Security and Privacy",
  series          = "Research in Security and Privacy",
  annote          = "something like active exchange \cite{Buerk:1990:VES}, [to get]"
}

@InProceedings{Aguilera:1996:RFD,
  title           = "Randomization and Failure Detection: {A} Hybrid Approach to Solve Consensus",
  author          = "Marcos Kawazoe Aguilera and Sam Toueg",
  booktitle       = pro-wdag96,
  editor          = "{\"O}zalp Babaoglu and Keith Marzullo",
  address         = "Bologna, Italy",
  month           = "9--11~" # oct,
  year            = "1996",
  series          = "Lecture Notes in Computer Science",
  volume          = "1151",
  publisher       = pub-SV,
  ISBN            = "3-540-61769-8",
  pages           = "29--39",
  annote          = "[to read]"
}

@InProceedings{Almeida:1996:TFD,
  author          = {Carlos Almeida and Paulo Ver{\'\i}ssimo},
  title           = {Timing Failure Detection and Real-Time Group Communication in Quasi-Synchronous Systems},
  booktitle       = {Proceedings of the 8th Euromicro Workshop on Real-Time Systems},
  OPTcrossref     = {},
  OPTkey          = {},
  OPTpages        = {},
  year            = {1996},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  address         = {L'Aquila, Italy},
  month           = jun,
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  annote          = {A newer and
more elaborate treatment of this topic can be found in \cite{Almeida:1998:QSA}.}
}

@Article{Arora:1996:CSB,
  author          = "Anish Arora and Mohamed G. Gouda and George Varghese",
  title           = "Constraint satisfaction as a basis for designing nonmasking fault-tolerant systems",
  OPTcrossref     = "Arora:1994:CSB",
  OPTkey          = "",
  journal         = "Journal of High Speed Networks",
  year            = "1996",
  volume          = "5",
  number          = "3",
  pages           = "293--306",
  OPTmonth        = "",
  OPTnote         = "A preliminary version appeared at ICDCS94.",
  annote          = "Probably the same as \cite{Arora:1994:CSB} but more citeable."
}

@InProceedings{Ayache:1996:FMV,
  author          = "S. Ayache and E. Conquet and P. Humbert and C. Rodriguez and J. Sifakis and R. Gerlich",
  title           = "Formal Methods for the Validation of Fault Tolerance in Autonomous Spacecraft",
  pages           = "353--359",
  ISBN            = "0-8186-7261-7",
  booktitle       = pro-ftcs96,
  month           = "25--27~" # jun,
  publisher       = "IEEE",
  address         = "Washington",
  year            = "1996",
  annote          = "[to read]"
}

@Article{Babaoglu:1996:UFS,
  author          = "{\"O}zalp Babao{\u{g}}lu and Eddy Fromentin and Michel Raynal",
  title           = "A unified framework for the specification and run-time detection of dynamic properties in distributed computations",
  OPTcrossref     = "",
  OPTkey          = "",
  journal         = "Journal of Systems and Software",
  year            = "1996",
  volume          = "33",
  OPTnumber       = "",
  pages           = "287--298",
  OPTmonth        = "",
  OPTnote         = "",
  annote          = "The authors present a general framework with which to detect a large class of properties of distributed computations. Abstractly, property detection can be seen as searching through an evolving directed acyclic graph (DAG) with labelled nodes. If the nodes carry labels according to specific properties, the detection problem can be formulated as an instance of the language recognition problem. This counts for all properties that are expressible as regular languages. The framework can be used to detect properties of computations based on sequences of local states (control flows).
It can also be used to detect properties defined on sequences of consistent global states. Thus, the method is a generalization of the property detection approaches of Cooper and Marzullo \cite{Cooper:1991:CDG} for possibly and definitely \cite{Babaoglu:1993:CGS,Schwarz:1994:DCR}. The detection methods are based on mapping an accepting automaton onto the nodes. For properties of control flow it suffices to add an array of bits (of the order of the set of states of the accepting automaton) to every node and message and have a distributed controller running and updating the array at every node. If the larger class of properties on sequences of global states is to be detected, the authors employ a central monitoring process (like in \cite{Cooper:1991:CDG}) that incrementally constructs the lattice of consistent states. The nodes of the lattice are an array of bits (one for every state of the accepting automaton). While the previous approach is practically feasible and implemented (in the EREBUS distributed debugger process mentioned in the article), the detection of sequences of global states seems to be infeasible. However, properties on single global states (like possibly and definitely) do not need the expressibility of regular languages and some detecting these sorts of predicates can be feasible in practice. The authors see their method as a sort of on-the-fly model checking that have no idea of the model they are checking against. Overall, I like this paper very much: it is concise and mathematically sound, uses a minimal set of examples and strives for theoretical excellence."
}

@Book{Barbosa:1996:IDA,
  author          = "Valmir C.
Barbosa",
  title           = "An Introduction to Distributed Algorithms",
  publisher       = "MIT Press",
  address         = "Cambridge, MA",
  year            = "1996",
  keywords        = "book, text, parallel processing, supercomputers, computer algorithms",
}

@TechReport{Basu:1996:SPP,
  title           = "Solving Problems in the Presence of Process Crashes and Lossy Links",
  author          = "Anindya Basu and Bernadette Charron-Bost and Sam Toueg",
  year            = "1996",
  month           = sep,
  pages           = "30",
  institution     = "Cornell University, Computer Science Department",
  number          = "TR96-1609",
  abstract        = "We study the effect of link failures on the solvability of problems in asynchronous systems that are subject to process crashes: given a problem that can be solved in a system with process crashes and reliable links, is the problem solvable even if links are lossy? We answer this question for two types of lossy links, and show that the answer depends on the maximum number of processes that may crash and the nature of the problem to be solved. In particular, we prove that the answer is positive if fewer than half of the processes may crash or if the problem specification does not refer to the state of processes that crash. However, in general, the answer is negative even if each link can lose only a finite number of messages.",
  annote          = "A shorter version appeared at WDAG-10 \cite{Basu:1996:SRL} which is summarized there and not in this bibliographic entry."
}

@InProceedings{Basu:1996:SRL,
  author          = "Anindya Basu and Bernadette Charron-Bost and Sam Toueg",
  title           = "Simulating reliable links with unreliable links in the presence of process crashes",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTeditor       = "",
  OPTvolume       = "",
  OPTnumber       = "",
  OPTseries       = "",
  pages           = "105--122",
  booktitle       = pro-wdag96,
  year            = "1996",
  OPTorganization = "",
  publisher       = pub-SV,
  address         = "Bologna, Italy",
  month           = oct,
  OPTnote         = "",
  annote          = "The authors investigate the question, what problems that are solvable with reliable links and possible process crashes remain solvable in the presence of unreliable links. They investigate two types of unreliable links: eventually reliable (there is a time after which the link becomes reliable = finite message loss), and fair lossy (if an infinite number of messages is sent over a channel, then an infinite number of messages is received at the other end = infinite message loss). Intuitively, a reliable link is also eventually reliable, and an eventually reliable link is also fair lossy. The authors show two things: (1) there are problems (e.g. uniform reliable broadcast) that are solvable with reliable channels but are not solvable with eventually reliable channels. This means that, in general, eventually reliable links cannot simulate reliable links. (2) if the majority of processes in the system is correct, then fair lossy links can simulate reliable links. The key idea behind this fact is that processes must infinitely often diffuse their message histories. This is however very inefficient (requires unbounded storage capacity in nodes and unbounded message length). In general, this is a paper which reveals again the importance of correct-restricted problems (problems in which only correct processes are required to do something), because correct-restricted problems remain solvable even with fair lossy links."
}

@InProceedings{Beauquier:1996:MFH,
  author          = {Joffroy Beauquier and Synn{\"o}ve Kekkonen},
  title           = {Making {FTSS} is hard},
  booktitle       = {Proceedings of the International Conference on Software Engineering (ICSE'96)},
  OPTcrossref     = {},
  OPTkey          = {},
  pages           = {91--96},
  year            = {1996},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  address         = {Las Vegas, USA},
  month           = jul,
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  annote          = {Possibly similar to brief announcement \cite{Beauquier:1997:OFS}. See also \cite{Beauquier:1997:FTS,Kekkonen:1998:RFA}.}
}

@InProceedings{Camp:1996:AAT,
  author          = {Jean Camp and Michael Harkavy and J. D. Tygar and Bennet Yee},
  title           = {Anonymous atomic transactions},
  booktitle       = {Proceedings of the 2nd USENIX Workshop on Electronic Commerce},
  OPTcrossref     = {},
  OPTkey          = {},
  pages           = {123--133},
  year            = {1996},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  OPTaddress      = {},
  month           = nov,
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  annote          = {[to write]}
}

@InProceedings{Chandra:1996:IGM,
  author          = "Tushar Deepak Chandra and Vassos Hadzilacos and Sam Toueg and Bernadette Charron-Bost",
  title           = "On the Impossibility of Group Membership",
  pages           = "322--330",
  booktitle       = pro-podc96,
  ISBN            = "0-89791-800-2",
  month           = may,
  publisher       = "ACM",
  address         = "New York, USA",
  year            = "1996",
  OPTnote         = "Also published as Technical Report TR95-1548, Cornell University.",
  annote          = "The authors show, that the problem of weak group membership (WGM) is impossible in the asynchronous system model used by Fischer et al. in their famous impossibility proof of consensus \cite{Fischer:1985:IDC}. WGM is defined having two properties: (liveness) if processes want to leave the group, at least one other process must install a new view of the group and no process installs a view different from it; (safety) it must be possible that the new view installed is correct.
Impossibility of WGM is especially noteworthy because it is at the core of many group communication systems (e.g. Isis and Transis). That's what makes this paper worthwhile citing."
}

@Article{Chandra:1996:UFD,
  author          = "Tushar Deepak Chandra and Sam Toueg",
  title           = "Unreliable failure detectors for reliable distributed systems",
  OPTcrossref     = "",
  OPTkey          = "",
  journal         = j-ACM,
  year            = "1996",
  volume          = "43",
  number          = "2",
  pages           = "225--267",
  month           = mar,
  OPTnote         = "",
  annote          = "Journal version of \cite{Chandra:1991:UFD}."
}

@Article{Chandra:1996:WFD,
  author          = "Tushar Deepak Chandra and Vassos Hadzilacos and Sam Toueg",
  title           = "The weakest failure detector for solving consensus",
  OPTcrossref     = "",
  OPTkey          = "",
  journal         = j-ACM,
  year            = "1996",
  volume          = "43",
  number          = "4",
  pages           = "685--722",
  month           = jul,
  OPTnote         = "",
  annote          = "Journal version of \cite{Chandra:1992:WFD}."
}

@InProceedings{Charpentier:1996:ACR,
  author          = "Michel Charpentier and Mamoun Filali and Philippe Mauran and G{\'e}rard Padiou and Philippe Qu{\'e}innec",
  title           = "Abstracting communication to reason about distributed algorithms",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTeditor       = "",
  OPTvolume       = "",
  OPTnumber       = "",
  OPTseries       = "",
  pages           = "89--103",
  booktitle       = pro-wdag96,
  year            = "1996",
  OPTorganization = "",
  OPTpublisher    = "",
  OPTaddress      = "",
  OPTmonth        = "",
  OPTnote         = "",
  annote          = "Concurrent programming formalisms like UNITY are often based on locally shared variables as communication primitives. While it is possible to extend these models via definitions with message passing primitives this is quite cumbersome. In this paper the authors propose an abstraction of communication based on observations: variable $x$ observes $y$ if updates of $x$ reflect all the updates of $y$ but not necessarily in a timely manner, i.e., eventually $x$ will take on all values of $y$ in the original order.
Using this observation relation on variables, the authors present inference rules for the UNITY framework that can be used to prove that distributed algorithms have certain properties. The observation relation is interesting because it abstracts from communication and a communication topology and thus acts like a transport layer of communication subsystems. No relations to knowledge based protocol formalisms are discussed, although they seem to have fundamental similarities."
}

@Article{Charron-Bost:1996:SAC,
  author          = {Bernadette Charron-Bost and Friedemann Mattern and Gerard Tel},
  title           = {Synchronous, asynchronous, and causally ordered communication},
  journal         = j-DC,
  year            = 1996,
  volume          = 9,
  pages           = "173--191",
  OPTannote       = {}
}

@InProceedings{Cristian:1996:GMS,
  author          = "Flaviu Cristian",
  title           = "Group, Majority, and Strict Agreement in Timed Asynchronous Distributed Systems",
  pages           = "178--189",
  ISBN            = "0-8186-7261-7",
  booktitle       = "Proceedings of the Twenty-Sixth International Symposium on Fault-Tolerant Computing",
  month           = "25--27~" # jun,
  publisher       = "IEEE",
  address         = "Washington",
  year            = "1996",
  annote          = "[to read]"
}

@Article{Cristian:1996:SAG,
  author          = "Flaviu Cristian",
  title           = "Synchronous and Asynchronous Group Communication",
  journal         = j-CACM,
  volume          = "39",
  number          = "4",
  pages           = "88--97",
  month           = apr,
  year            = "1996",
  subject         = "{\bf H.5.3}: Information Systems, INFORMATION INTERFACES AND PRESENTATION, Group and Organization Interfaces, Asynchronous interaction.
{\bf H.5.3}: Information Systems, INFORMATION INTERFACES AND PRESENTATION, Group and Organization Interfaces, Synchronous interaction.",
  annote          = "[to read]",
}

@InProceedings{Dega:1996:RMA,
  author          = "Jean-Louis Dega",
  title           = "The redundancy mechanisms of the {Ariane} 5 operational control center",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTeditor       = "",
  OPTvolume       = "",
  OPTnumber       = "",
  OPTseries       = "",
  pages           = "382--386",
  booktitle       = pro-ftcs96,
  year            = "1996",
  OPTorganization = "",
  OPTpublisher    = "",
  OPTaddress      = "",
  OPTmonth        = "",
  OPTnote         = "",
  annote          = "Dega reports on details of the ground control system of the Ariane 5 project. The system is a fully distributed real-time system built on top of an off-the-shelf real-time operating system. It controls the countdown procedure until 3 seconds before the launch. The main system components are duplicated twice (using hot/warm standby) and can be repaired on-line. The main dependability requirement is to be fail-safe, i.e. in case of critical failures the system should stop in a safe state. The probability of a serious event occurring during was aimed to be $10^{-6}$. Main failure detection functions are performed by the components themselves (self-checking). The time constraints to hand over from active to standby machine are less than 300 ms. The design of the system is another example for safety being more important than liveness in practical systems (see \cite{Kreitz:1998:SWL}.) See also the notes on redundancy in a report on current NASA work \cite{Marcopulos:1998:FBC}."
}

@TechReport{Dolev:1996:FDO,
  author          = "Danny Dolev and Roy Friedman and Idit Keidar and Dahlia Malkhi",
  title           = "Failure detectors in omission failure environments",
  institution     = "Cornell University, Computer Science Department",
  year            = "1996",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTtype         = "",
  number          = "TR96-1608",
  OPTaddress      = "",
  month           = sep,
  OPTnote         = "",
  annote          = "[to read] studies also partitions, surveyed in \cite{Aguilera:1998:FDC}.
Published as a brief announcement at PoDC 97 \cite{Dolev:1997:FDO}."
}

@Misc{ESA:1996:A5F,
  OPTkey          = {},
  author          = {{European Space Agency}},
  title           = {ARIANE 5 Flight 501 Failure},
  howpublished    = {\url{http://www.esrin.esa.it/htdocs/tidc/Press/Press96/ariane5rep.html}},
  month           = jul,
  year            = {1996},
  note            = {Report by the Inquiry Board},
  OPTannote       = {}
}

@inproceedings{Fetzer:1996:FAT,
  author          = {Christof Fetzer and Flaviu Cristian},
  title           = {Fail-Awareness in Timed Asynchronous Systems},
  booktitle       = pro-podc96,
  year            = {1996},
  month           = may,
  address         = {Philadelphia},
  pages           = {314--321a},
  note            = {\url{http://www-cse.ucsd.edu/users/cfetzer/FA/fa.html}},
  annote          = "Shows how to transform a synchronous specification $S$ into a weakened specification $F$ which is implementable in timed-asynchronous systems. A synchronous specification is one which prescribes a real-time deadline for completion of the service. See \cite{Cristian:1999:TAD} for an explanation of the timed asynchronous system model."
}

@InProceedings{Fetzer:1996:FFD,
  author          = "Christof Fetzer and Flaviu Cristian",
  title           = "Fail-Aware Failure Detectors",
  pages           = "200--209",
  booktitle       = "Proceedings of the 15th Symposium on Reliable Distributed Systems ({SRDS} 1996)",
  ISBN            = "0-8186-7481-4",
  month           = oct,
  publisher       = "IEEE Computer Society Press",
  address         = "Los Alamitos, Ca., USA",
  year            = "1996",
  annote          = "The authors report on the contradiction that election has been proved requiring a perfect failure detector \cite{Sabel:1995:EVC} but election seems implementable in existing asynchronous systems \cite{Fetzer:1995:PCA}. To resolve this contradiction, they introduce a new class of fail-aware failure detectors which together with certain progress assumptions \cite{Fetzer:1995:PCA} are sufficient to solve election and are implementable in timed asynchronous systems \cite{Cristian:1998:TAS}.
Fail-aware failure detectors are based on the idea that a process suspects itself immediately if it is suspected by another process (strong fail-awareness) or by a majority of other processes (weak fail awareness). Together with the strong completeness and eventual weak accuracy of Chandra and Toueg \cite{Chandra:1996:UFD} both attributes are sufficient to solve election in asynchronous systems. The failure detectors have infinite output domains and thus resemble very much those of \cite{Aguilera:1997:HTF}. The fail-awareness property can be implemented in timed asynchronous systems only when making progress assumptions \cite{Fetzer:1995:PCA} which assume strict synchrony for a ``sufficiently long'' period of time. Also, these are the first failure detectors that actually reference real time in their specifications, which is a little confusing when designing algorithms for the time-free model. However, exact formal definitions in terms of \cite{Chandra:1996:UFD} are given but the full consequences of the definition in terms of whether ``new'' and previously undiscovered features are added are not discussed in depth."
}

@Book{Gabriel:1996:POS,
  author          = "Richard P. Gabriel",
  title           = "Patterns of {Software}. {Tales} from the {Software} {Community}",
  publisher       = "Oxford University Press",
  year            = "1996",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTeditor       = "",
  OPTvolume       = "",
  OPTnumber       = "",
  OPTseries       = "",
  address         = "New York, Oxford",
  OPTedition      = "",
  OPTmonth        = "",
  OPTnote         = "",
  annote          = "This is a striking, sometimes shocking and sometimes amusing collection of essays by a man who (as one of the developers of Lisp) has closely been related to the emergence of high level programming languages and the entire software engineering discipline for about 20 to 30 years. Gabriel not only gives inspiring insights into the benefits of small systems ({\`a} la Siefkes), good documentation ({\`a} la Knuth's literate programming) and what makes a programming language good vs.
widely accepted, but also tells instructive tales about the rise and fall of his own company, Lucid, during the 90s. A well readable book written in almost spoken language and with sometimes a little ``diffusing'' sequences of ideas, but with a lot of perfectly arguable points, which makes this book a good starting point for discussions on software engineering (along with Brooks' ``No Silver Bullet'' \cite{Brooks:1987:NSB})."
}

@Article{Garg:1996:DSU,
  author          = {V. K. Garg and Brian Waldecker},
  title           = {Detection of strong unstable predicates in distributed programs},
  journal         = {IEEE Transactions on Parallel and Distributed Systems},
  year            = {1996},
  OPTkey          = {},
  volume          = {7},
  number          = {12},
  OPTmonth        = {},
  pages           = {1323--1333},
  OPTnote         = {},
  annote          = "Angaben aus \cite{Stoller:1997:DGP}."
}

@Book{Garg:1996:PDS,
  author          = {Vijay K. Garg},
  title           = {Principles of Distributed Systems},
  publisher       = {Kluwer Academic Publishers},
  year            = {1996},
  OPTkey          = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  address         = {Boston, MA},
  OPTedition      = {},
  OPTmonth        = {},
  OPTnote         = {},
  OPTannote       = {}
}

@InProceedings{Guerraoui:1996:CS,
  author          = "R. Guerraoui and A. Schiper",
  title           = "Consensus Service: {A} Modular Approach for Building Agreement Protocols in Distributed Systems",
  pages           = "168--177",
  ISBN            = "0-8186-7261-7",
  booktitle       = pro-ftcs96,
  month           = "25--27~" # jun,
  publisher       = "IEEE",
  address         = "Washington",
  year            = "1996",
  annote          = "[to read]"
}

@InProceedings{Guerraoui:1996:GAF,
  title           = "``{Gamma}-Accurate'' Failure Detectors",
  author          = "Rachid Guerraoui and Andr{\'e} Schiper",
  booktitle       = "Distributed Algorithms, 10th International Workshop, {WDAG} '96",
  editor          = "{\"O}zalp Babaoglu and Keith Marzullo",
  address         = "Bologna, Italy",
  month           = "9--11~" # oct,
  year            = "1996",
  series          = "Lecture Notes in Computer Science",
  volume          = "1151",
  publisher       = pub-SV,
  ISBN            = "3-540-61769-8",
  pages           = "269--286",
  annote          = "[to read]"
}

@PhdThesis{Hoefling:1996:MFP,
  author          = {T.
H{\"o}fling},
  title           = {{Methoden zur Fehlererkennung mit Parametersch\"atzung und Parit\"atsgleichungen}},
  school          = {Technische Hochschule Darmstadt},
  year            = {1996},
  OPTkey          = {},
  OPTtype         = {},
  OPTaddress      = {},
  OPTmonth        = {},
  OPTnote         = {{erschienen als Fortschr. Ber. VDL, VDI-Verlag, D\"usseldorf}},
  OPTannote       = {}
}

@InProceedings{Hurfin:1996:ODC,
  author          = "Michel Hurfin and M. Mizuno and Michel Raynal and M. Singhal",
  title           = "On-the-Fly Detection of Conjunctions of Local Predicates in Distributed Computations",
  pages           = "589--592",
  booktitle       = "Eighth {IEEE} Symposium on Parallel and Distributed Processing ({SPDP}'96)",
  ISBN            = "0-8186-7683-3",
  month           = oct,
  publisher       = "IEEE Computer Society",
  address         = "Washington",
  year            = "1996",
  annote          = "[to get]"
}

@Article{Hutter:1996:VSE,
  author          = {Dieter Hutter and Bruno Langenstein and Claus Sengler and J{\"o}rg H. Siekmann and Werner Stephan and Andreas Wolpers},
  title           = {Verification Support Environment {(VSE)}},
  journal         = {High Integrity Systems},
  year            = {1996},
  OPTkey          = {},
  volume          = {1},
  number          = {6},
  pages           = {523--530},
  OPTmonth        = {},
  OPTnote         = {},
  annote          = {Gives an overview of VSE-I. Good reference. For VSE-II, better cite \cite{Hutter:1998:VSE}.}
}

@Article{Isermann:1996:MUE,
  author          = {Rolf Isermann},
  title           = {{Modellgest\"utzte \"Uberwachung und Fehlerdiagnose technischer Systeme}},
  journal         = {Automatisierungstechnische Praxis},
  year            = {1996},
  OPTkey          = {},
  volume          = {38},
  OPTnumber       = {},
  pages           = {9--20, 48--57},
  OPTmonth        = {},
  OPTnote         = {},
  annote          = {[Angabe von Armin]}
}

@phdthesis{Janowski:1996:BAF,
  author          = "T.
Janowski",
  title           = "Bisimulation and Fault-Tolerance",
  number          = "CS-THESIS-JANOWSKI96",
  year            = "1996",
  month           = feb,
  type            = "Thesis",
  url             = "http://www.dcs.warwick.ac.uk/pub/reports/theses/jan96.html",
  school          = "Department of Computer Science, University of Warwick",
  address         = "Coventry, UK",
  note            = {Also University of Warwick Department of Computer Science Research Report CS-RR-300},
  abstract        = { In the area of concurrent, communicating systems, a common approach to verify the absence of design faults is in terms of an equivalence relation between a high-level and a low-level process. One such relation is bisimulation and this holds if two processes cannot be distinguished by observing them for a finite interval of time. However, the absence of design faults does not guarantee that the process will behave correctly in practice as it depends on various hardware devices which may be subject to physical faults themselves. Such faults cannot be avoided; they must be tolerated. The purpose of this thesis is to provide a formal framework, based on bisimulations and using the Calculus of Communicating Systems, by which we can specify, design and verify concurrent, fault-tolerant systems, with emphasis placed on reasoning and design under weak assumptions about faults. },
  annote          = {[to get, requested from Warwick]}
}

@InProceedings{Liu:1996:VFR,
  author          = "Z. Liu and M. Joseph",
  title           = "Verification of Fault Tolerance and Real Time",
  pages           = "220--229",
  ISBN            = "0-8186-7261-7",
  booktitle       = pro-ftcs96,
  month           = jun,
  publisher       = "IEEE",
  address         = "Sendai, Japan",
  year            = "1996",
  annote          = "Programs and specifications are viewed as formulas in the same logic (originally an idea of \cite{Pnueli:1981:TSC} explained in \cite{Abadi:1995:CS,Abadi:1993:CS,Lamport:1989:SAS}). The logic used here is TLA \cite{Lamport:1994:TLA} and the programming notation are action systems (i.e. simple automata).
Both formalisms are related and it is shown how to transform action systems into TLA. Here, only TLA formulas are studied which are safety properties, i.e. only safety properties of action systems are discussed. The development of a program $P$ from a specification $S$ can be viewed as a sequence of refinement steps $P<P_n<\ldots P_1=S$ starting with $S$ and ending with $P$ where in each step the lower level version of the program is shown to implement the higher level version (this is done using some refinement calculus, e.g. \cite{Abadi:1991:ERM}). A program $P$ which implements $S$ in fault-free operations may not do so in the presence of physical faults. Faults are modeled as a set $f$ of fault operations on the system state and the effect of faults is viewed as a transformation $F(P,f)$ which is an interleaved execution of $P$ and $f$. The transformed program is called the ``$f$-affected'' version of $P$. If the $f$-affected version of $P$ satisfies some property $q$ and $q$ is the specification of some program $P'$ then $P$ is the $f$-tolerant refinement of $P'$, denoted $P<_f P'$. The relation $<_f$ is stronger than the ordinary refinement relation $<$ and generally is not reflexive (why?). But it is somewhat transitive: if $P_1<_{f_1}P_2$ and $P_2<_{f_2}P_3$ then $P_1<_{f_1}P_3$ ! Apart from an actions set $f$ a fault model requires a behavioral specification called ``behavioral fault assumption''. This is analogous to the rely specification of \cite{Voelzer:1998:VFT}. Generally, this is a safety property (as conjectured in \cite{Gaertner:1999:ESD}) so it can be ``implemented'' within $f$. Separation of fault actions and behavioral fault assumption usually makes specification easier. Proving the fault tolerance properties of some program results in proving that a program is a fault tolerant refinement of another. Real time is basically handled by adding a clock and formulating real time properties as safety properties.
Section 5 discusses related work: \cite{Liu:1992:TPF} presents methods how to obtain fault-tolerant refinements of programs, other work is \cite{Liu:1994:SDF}. It is noted that these methods can be used to prove fault-tolerant algorithms using PVS. Transformational approaches are independent of formalism (\cite{Nordahl:1993:DFD} uses CSP, \cite{Janowski:1996:BAF} uses CCS)."
}

@Book{Lynch:1996:DA,
  author          = {Nancy Lynch},
  title           = {Distributed Algorithms},
  publisher       = {Morgan Kaufmann},
  address         = {San Mateo, CA},
  year            = {1996}
}

@INPROCEEDINGS{Mizuno:1996:TBT,
  AUTHOR          = "Masaaki Mizuno and Hirotsugu Kakugawa",
  TITLE           = "A timestamp based transformation of self-stabilizing programs for distributed computing environments",
  BOOKTITLE       = pro-wdag96,
  YEAR            = 1996,
  PAGES           = "304--321",
  annote          = "In the serial model, an atomic execution step consists of a read-sub-step, where processes read the state of their neighbours, followed by a local state change. Each process can always see the states of one of its neighbours and only one process at a time executes an atomic step. In the distributed model, an atomic execution step is either a read-sub-step or a local state change based on its own state and the locally recorded neighbours' states. In this paper the authors present a method to transform an algorithm from the serial model to an algorithm from the distributed model and show that the self-stabilization property is preserved during transformation. The idea of the scheme is to simulate the serial model by imposing a transaction-commit protocol on every execution step of the original algorithm. As an execution step corresponds to a transaction, a lot of theorems from serializability theory may be applied. The criterion to prove serializability of the transformed program bases on timestamps from Lamport logical clocks. Correctness and message complexity depends on the usual prerequisites of reaching consensus and the careful choice of timeout values."
}

@inproceedings{Owre:1996:PVS,
  TITLE           = {{PVS}: Combining Specification, Proof Checking, and Model Checking},
  AUTHOR          = {S. Owre and S. Rajan and J. M. Rushby and N. Shankar and M. K. Srivas},
  BOOKTITLE       = {Computer-Aided Verification, CAV '96},
  EDITOR          = {Rajeev Alur and Thomas A. Henzinger},
  PAGES           = {411--414},
  PUBLISHER       = pub-SV,
  SERIES          = {Lecture Notes in Computer Science},
  NUMBER          = 1102,
  MONTH           = jul # "/" # aug,
  YEAR            = 1996,
  ADDRESS         = {New Brunswick, NJ}
}

@Book{Spies:1996:FSS,
  ALTauthor       = {},
  editor          = {Katharina Spies and Manfred Broy and Stephan Merz},
  title           = {Formal Systems Specification: The {RPC}-Memory Specification Case Study},
  publisher       = pub-SV,
  year            = {1996},
  OPTkey          = {},
  OPTvolume       = {},
  number          = {1169},
  series          = ser-LNCS,
  OPTaddress      = {},
  OPTedition      = {},
  month           = dec,
  OPTnote         = {},
  annote          = {A collection of papers from a Dagstuhl seminar 9439 in 1994 where a sample problem is specified and verified in many different formalisms. There's also a Dagstuhl report with abstracts on this.}
}

@Book{Tanenbaum:1996:CN,
  author          = "Andrew S. Tanenbaum",
  title           = "Computer Networks",
  publisher       = pub-PH,
  year            = "1996",
  OPTcrossref     = "",
  OPTkey          = "",
  OPTeditor       = "",
  OPTvolume       = "",
  OPTnumber       = "",
  OPTseries       = "",
  address         = pub-PH:adr,
  edition         = "Third",
  OPTmonth        = "",
  OPTnote         = "",
  annote          = "The well-known bestseller."
}

@InProceedings{Vogels:1996:WWF,
  author          = {Werner Vogels},
  title           = {World Wide Failures},
  booktitle       = {Proceedings of the ACM SIGOPS European Workshop},
  OPTcrossref     = {},
  OPTkey          = {},
  OPTpages        = {},
  year            = {1996},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  address         = {Connemara, Ireland},
  month           = sep,
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  annote          = {Discusses practical concerns in building reliable failure detectors. Explicitly references \cite{Chandra:1996:UFD} and presents some timeout measurements. It is said that a full paper with detailed results will appear soon. Has it already?
A good complement to \cite{Sergent:1999:FDI}.}
}

@InProceedings{Zhou:1996:FNP,
  author          = "Jianying Zhou and Dieter Gollmann",
  title           = "A Fair Non-repudiation Protocol",
  keywords        = "non-repudiation, trusted third party",
  pages           = "55--61",
  year            = "1996",
  booktitle       = "Proceedings of the IEEE Symposium on Research in Security and Privacy",
  address         = "Oakland, CA",
  publisher       = pub-IEEE,
  month           = may,
  organization    = "IEEE Computer Society, Technical Committee on Security and Privacy",
  annote          = "[to get]"
}

@InProceedings{Afek:1997:LS,
  author          = "Yehuda Afek and Shlomi Dolev",
  title           = "Local Stabilizer",
  booktitle       = pro-podc97,
  pages           = "287--?",
  year            = "1997",
  annote          = "The authors present a protocol module which can be imposed onto arbitrary round based algorithms and turn it into a self-stabilizing algorithm. This is done much in the spirit of Katz and Perry \cite{Katz:1993:SEM} by using a detection protocol and a repair protocol. The detection protocol part sends the complete state of a node to all its neighbours in every round. After $d$ rounds, and by forwarding states from neighbours, every node will be able to construct a pyramid of local snapshots of all nodes within diameter $d$ of itself. Level $k$ of the pyramid reflects the state of $k$-distant node before $k$ rounds. Not only the state is forwarded, but also the inputs to the node before that round, so a remote node can check what the node in question was supposed to be doing and can detect inconsistencies immediately. On detecting an inconsistency, the repair mechanism freezes the outer network and diffuses the ``right'' state to all processes within the infected portion of the network. In case this is not possible (because the majority of nodes has been perturbed for example), a reset procedure is invoked. This paper contains some very clever ideas that have a lot of potential for optimization.
The pyramid of states however and checking the consistency implies that
               every node does all the computations of all other nodes.
               Together with the round based model this implements an
               omniscient observer at every node that takes snapshots at the
               beginning of every round. Because of the round
               synchronization, these snapshots must be identical at every
               node. Thus inconsistencies can be detected. Stabilization time
               is fast, but a huge amount of space needed."
}

% WDAG'97 paper introducing the heartbeat failure detector. The OPT-prefixed
% fields are deactivated (ignored by styles) but still record series data.
@InProceedings{Aguilera:1997:HTF,
  author       = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  title        = "Heartbeat: a timeout-free failure detector for quiescent reliable communication",
  OPTnumber    = "1320",
  OPTseries    = ser-LNCS,
  pages        = "126--140",
  booktitle    = pro-WDAG97,
  year         = "1997",
  OPTpublisher = "Springer",
  month        = sep,
  annote       = "The authors consider the problem of reliable communication
                  within quiescent algorithms, i.e. algorithms that
                  eventually stop sending messages, in asynchronous systems
                  with lossy links. They solve the problem using a novel
                  failure detector called `heartbeat'. This failure detector
                  is a vector of size $n$ within each node, where $n$ is the
                  number of neighbours the node has (one entry per
                  neighbour). The value of slot $i$ increments if an alive
                  signal (message) has been received by neighbour $i$. It is
                  shown that reliable communication can be achieved in such
                  settings using heartbeat but it seems that the problems of
                  timeouts and synchrony are moved one level downwards. The
                  authors argue that this is okay since the failure detector
                  may be shared by other system modules and there is no
                  `terminating' version of failure detectors anyway. The
                  authors claim that heartbeat is implementable and give
                  evidence in which they use the term ``periodically''
                  instead of ``timeout''. The main novelty with heartbeat is
                  that it has an infinite range, i.e.
it outputs infinite values (in contrast to previous versions that output
                  finite lists of suspects). Apart from this last point, this
                  paper is a good starting point for finding literature on
                  failure detection."
}

% Cornell technical report; superseded by the journal version cited in annote.
@TechReport{Aguilera:1997:QRC,
  title       = "Quiescent Reliable Communication and Quiescent Consensus in Partitionable Networks",
  author      = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  year        = "1997",
  institution = "Cornell University, Computer Science Department",
  month       = jun,
  pages       = "24",
  number      = "TR97-1632",
  abstract    = "We consider partitionable networks with process crashes and
                 lossy links, and focus on the problems of reliable
                 communication and consensus for such networks. For both
                 problems we seek algorithms that are quiescent, i.e.,
                 algorithms that eventually stop sending messages. We first
                 tackle the problem of reliable communication for
                 partitionable networks by extending the results of [ACT97a].
                 In particular, we generalize the specification of the
                 heartbeat failure detector HB, show how to implement it, and
                 show how to use it to achieve quiescent reliable
                 communication. We then turn our attention to the problem of
                 consensus for partitionable networks. We first show that,
                 even though this problem can be solved using a natural
                 extension of <>S, such solutions are not quiescent --- in
                 other words, <>S alone is not sufficient to achieve
                 quiescent consensus in partitionable networks. We then solve
                 this problem using <>S and the quiescent reliable
                 communication primitives that we developed in the first part
                 of the paper. Our model of failure detectors for
                 partitionable networks, a natural extension of the model in
                 [CT96], is also a contribution of this paper.",
  annote      = "See the Journal version \cite{Aguilera:1999:UHF}."
}

@TechReport{Aguilera:1997:WFD,
  title       = "On the Weakest Failure Detector for Quiescent Reliable Communication",
  author      = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  year        = "1997",
  month       = jul,
  pages       = "16",
  number      = "TR97-1640",
  institution = "Cornell University, Computer Science Department",
  abstract    = "We consider the problem of achieving reliable communication
                 with quiescent algorithms (i.e., algorithms that eventually
                 stop sending messages) in asynchronous systems with process
                 crashes and lossy links, and show that, among failure
                 detectors with bounded output size, <>P is the weakest one
                 that can be used to solve this problem. Combined with a
                 result in [ACT97a], this shows that failure detectors that
                 are commonly used in practice, i.e., those that output lists
                 of suspects, are not always the best ones to solve a
                 problem.",
  annote      = "[to read]"
}

@Article{Alur:1997:TAA,
  title   = "Time-Adaptive Algorithms for Synchronization",
  author  = "Rajeev Alur and Hagit Attiya and Gadi Taubenfeld",
  pages   = "539--556",
  journal = "SIAM Journal on Computing",
  month   = apr,
  year    = "1997",
  volume  = "26",
  number  = "2",
  annote  = "Proves that time is insignificant to safety properties.
             Referenced in \cite{Merritt:1998:FSO}. Looks at consensus and
             mutual exclusion in shared memory environments that have an
             unknown upper bound on memory access times."
}

% Unpublished revision distributed by FTP; see annote for the earlier version.
@Misc{Arora:1997:OCC,
  author       = {Anish Arora and Mohamed G. Gouda},
  title        = {On the correctness criteria of load balancing programs},
  howpublished = {Internet: ftp://ftp.cis.ohio-state.edu/pub/anish/papers/load-balancing.ps.gz},
  month        = apr,
  year         = {1997},
  annote       = {revised version of \cite{Arora:1995:ECC}, submitted to IEEE
                  TPDS.}
}

@InProceedings{Asokan:1997:OPF,
  author    = "N. Asokan and M. Schunter and M.
Waidner",
  title     = "Optimistic Protocols for Fair Exchange",
  pages     = "8--17",
  booktitle = "4th {ACM} Conference on Computer and Communications Security",
  address   = "Zurich, Switzerland",
  year      = "1997",
  publisher = "ACM Press",
  month     = apr,
  editor    = "Tsutomu Matsumoto",
  annote    = "[to get]"
}

@Article{Avizienis:1997:TSD,
  author  = "Algirdas Avizienis",
  title   = "Toward Systematic Design of Fault-Tolerant Systems",
  journal = j-IEEE-COMPUTER,
  volume  = "30",
  number  = "4",
  pages   = "51--58",
  month   = apr,
  year    = "1997",
  annote  = "[to read]"
}

@TechReport{Babaoglu:1997:PGM,
  author      = "{\"O}zalp {Babao\u{g}lu} and Renzo Davoli and Alberto Montresor",
  title       = "Partitionable group membership: specification and algorithms",
  institution = "Department of Computer Science, University of Bologna, Italy",
  year        = "1997",
  number      = "UBLCS-97-1",
  month       = jan,
  note        = "Revised May 1997.",
  OPTannote   = "[to read]"
}

@InProceedings{Beauquier:1997:OFS,
  title     = "On {FTSS}-Solvable Distributed Problems",
  author    = "Joffroy Beauquier and Synn{\"o}ve Kekkonen-Moneta",
  pages     = "290",
  booktitle = "Proceedings of the Sixteenth Annual {ACM} Symposium on Principles of Distributed Computing",
  address   = "Santa Barbara, California",
  month     = "21--24~" # aug,
  year      = "1997",
  annote    = "Brief announcement at PODC, 1 page only. See also
               \cite{Beauquier:1996:MFH,Beauquier:1997:FTS,Kekkonen:1998:RFA}."
}

% Accented letters in names use the {\"o} form (a brace group starting with a
% control sequence), which BibTeX treats as one letter when sorting.
@Article{Beauquier:1997:FTS,
  author  = {Joffroy Beauquier and Synn{\"o}ve Kekkonen-Moneta},
  title   = {Fault-tolerance and self-stabilization: impossibility results and solutions using self-stabilizing failure detectors},
  journal = {International Journal of Systems Science},
  year    = {1997},
  volume  = {28},
  number  = {11},
  pages   = {1177--1187},
  annote  = {A rounding-up paper of previous work in fault-tolerance and
             self-stabilization started with \cite{Gopal:1993:USF} and
             \cite{Anagnostou:1993:TTP}.
The authors show that the transformation of a fault-tolerant protocol into a
             fault-tolerant self-stabilizing (ftss) protocol (performed in
             synchronous environments in \cite{Gopal:1993:USF}) cannot be
             extended to asynchronous environments because it is impossible
             to distinguish a slow process from a crashed one. Then, they
             show that the impossibility result of
             \cite{Anagnostou:1993:TTP} (which also rests on the necessity to
             distinguish a crashed from a slow process) can be extended to a
             class of networks. These results can be circumvented by adding
             some synchrony to the model in the form of failure detectors (in
             the spirit of \cite{Chandra:1996:UFD}). The synchrony assumption
             here is called ``fair communication'', meaning that a correct
             process can receive only finitely many messages from any one
             correct neighbor before receiving a message from every other
             correct neighbor. (Processes are assumed to emit a message to
             every neighbor at every tick of their local clock.) This seems
             to be equivalent to a combination of stabilizing clock drift and
             stabilizing transmission delay. The authors give implementations
             for failure detectors based on this assumption for both cases
             whether or not the bound is known or only the time until it
             holds is unknown or not (in the spirit of partial synchrony
             \cite{Dwork:1988:CPP}). The ideas herein are exposed more
             elaborately in Kekkonen-Moneta's thesis
             \cite{Kekkonen:1998:RFA}.}
}

% German-language thesis: the title is fully brace-protected on purpose so
% that styles do not lowercase the German nouns.
@PhdThesis{Borcherding:1997:AEB,
  author = {Malte Borcherding},
  title  = {{Authentifikationsvoraussetzungen f\"ur effiziente byzantinische \"Ubereinstimmung}},
  school = {Universit{\"a}t Karlsruhe, Fakult{\"a}t f{\"u}r Informatik},
  year   = {1997},
  note   = {Logos-Verlag, Berlin},
  annote = {Stellt mehrere Zwischengrade der Authentifikation vor, die
            effizientere Agreement-Algorithmen ermoeglichen.
Implizit Definition einer deutschen Terminologie fuer
            Uebereinstimmungsprobleme.}
}

@Article{Chen:1997:FRC,
  author  = "Biao Chen and Sanjay Kamat and Wei Zhao",
  title   = "Fault-Tolerant, Real-Time Communication in {FDDI}-Based Networks",
  journal = j-IEEE-COMPUTER,
  volume  = "30",
  number  = "4",
  pages   = "83--90",
  month   = apr,
  year    = "1997",
  annote  = "[to read]"
}

% PODC'97 brief announcement. The OPT-prefixed fields are deactivated
% (ignored by styles) but still record the page number and reading notes.
@InProceedings{Dolev:1997:FDO,
  author    = {Danny Dolev and Roy Friedman and Idit Keidar and Dahlia Malkhi},
  title     = {Failure detectors in omission failure environments},
  booktitle = pro-podc97,
  OPTpages  = {186},
  year      = {1997},
  OPTannote = {This is a brief announcement (1 page) at PODC97. See also the
               technical report version \cite{Dolev:1996:FDO} [to read]}
}

@Article{Dolev:1997:PIS,
  author  = "Shlomi Dolev",
  title   = "Possible and impossible self-stabilizing digital clock synchronization in general graphs",
  journal = "Journal of Real-Time Systems",
  volume  = "12",
  number  = "1",
  year    = "1997",
  pages   = "95--107",
  annote  = "This paper contains a good general survey of clock
             synchronization in shared-memory multiprocessor systems with a
             general communication graph."
}

@Article{Dolev:1997:SRR,
  title   = "Self-Stabilizing Routing and Related Protocols",
  author  = "Shlomi Dolev",
  pages   = "122--127",
  journal = "Journal of Parallel and Distributed Computing",
  year    = "1997",
  volume  = "42",
  number  = "2",
  annote  = "[to read]"
}

% EPFL technical report on muteness failure detectors. The OPT-prefixed
% fields are deactivated (ignored by styles) but still record report data.
@TechReport{Doudou:1997:MDC,
  author      = "Assia Doudou and Andr{\'e} Schiper",
  title       = "Muteness detectors for consensus with {Byzantine} processes",
  institution = "EPFL -- {D\'epartement} d'Informatique, Lausanne, Switzerland",
  year        = "1997",
  OPTnumber   = "TR-97/230",
  OPTmonth    = oct,
  annote      = "The authors extend the notion of failure detectors to the
                 Byzantine failure model. Generally, a Byzantine process can
                 do four things: (1) ignore another process, or (2) send
                 garbled messages to another process, or (3) send messages
                 that seem correct to another process but do not follow the
                 protocol, or (4) skip protocol messages. To combat this type
                 of faulty behavior the authors present a muteness failure
                 detector. A process i is mute to a process j if there is a
                 time after which i crashes, or i stops sending messages to
                 j, or i sends only incorrectly signed messages or unsigned
                 messages to j. Based on this definition, the muteness
                 detector is defined in terms of the traditional eventual
                 weak accuracy property and mute completeness, stating that
                 eventually every process i which is mute to process j is
                 permanently suspected by j. The muteness detector guards
                 against (1) and (2). The behaviors (3) and (4) can be
                 detected and corrected by usual methods to solve Byzantine
                 agreement \cite{Lamport:1982:BGP}. Channels must be FIFO to
                 be able to detect missing messages (previous solutions
                 required causal message delivery \cite{Malkhi:1997:UID}).
The authors adapt the consensus specification to Byzantine environments
                 (resulting in a definition of the vector consensus problem)
                 and give an algorithm that uses the muteness detector to
                 achieve consensus in a Byzantine environment. The algorithm
                 is based on the early consensus algorithm by Schiper
                 \cite{Schiper:1997:ECA}. There's a mentioning of the echo
                 broadcast technique for solving the Byzantine Generals
                 Problem \cite{Lamport:1982:BGP} with signed messages."
}

% FTCS'97 paper introducing fail-awareness for timed asynchronous systems.
@InProceedings{Fetzer:1997:FAA,
  author    = "Christof Fetzer and Flaviu Cristian",
  title     = "Fail-Awareness: An Approach to Construct Fail-Safe Applications",
  pages     = "282--291",
  booktitle = "Proceedings of The Twenty-Seventh Annual International Symposium on Fault-Tolerant Computing ({FTCS}'97)",
  ISBN      = "0-8186-7831-3",
  month     = jun,
  publisher = "IEEE",
  year      = "1997",
  annote    = "The authors introduce the notion of fail-awareness as an
               approach to construct fail-safe applications. Fail-awareness
               is based on the idea that the underlying system is timed
               asynchronous, i.e., it is synchronous with a bound on
               timeliness and failure rate most of the time, and asynchronous
               in special cases. If such asynchronous phases can be detected,
               the affected parts of the system must switch into an exception
               mode that signals this fact to clients. In this way, the
               system may degenerate in a safe way. If synchronous
               performance is re-established, services may re-join and catch
               up again. Fail-awareness can be used to transform synchronous
               service specifications so that they become implementable in
               timed asynchronous systems. The detection of timeliness
               properties is based on synchronized clocks. A hierarchy of
               fail-aware services is presented. Overall, this paper is very
               dense and much of the details of protocols are left to other
               references which should be read to be convincing. It is
               another example that detection is a prerequisite of fail-safe
               or masking fault tolerance."
}

% Months are given as the bare three-letter macros so that the bibliography
% style controls their spelling.
@InProceedings{Fetzer:1997:FAD,
  author    = {Christof Fetzer and Flaviu Cristian},
  title     = {A Fail-Aware Datagram Service},
  booktitle = {Proceedings of the 2nd Annual Workshop on Fault-Tolerant Parallel and Distributed Systems},
  year      = {1997},
  month     = apr,
  address   = {Geneva, Switzerland},
  note      = {\url{http://www-cse.ucsd.edu/users/cfetzer/FADS/fads.html}},
  annote    = "[to read]"
}

@InProceedings{Fetzer:1997:TAA,
  author    = "Christof Fetzer and Shivakant Mishra and Flaviu Cristian",
  title     = "The Timewheel Asynchronous Atomic Broadcast Protocol",
  booktitle = "International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'97)",
  publisher = "IEEE",
  address   = "Las Vegas, Nevada, USA",
  month     = jun,
  year      = "1997",
  abstract  = "http://www.cps.udayton.edu/\~{}pan/pdpta.",
  annote    = "Presents a collection of several total order broadcast
               protocols. Ordering can be unordered, total or time order,
               atomicity can be weak, strong or strict. Focus is on
               performance issues, unlike \cite{Hadzilacos:1994:MAF}. I think
               there's also something called timewheel group membership."
}

@InProceedings{Franklin:1997:FES,
  author    = "Matthew K. Franklin and Michael K. Reiter",
  title     = "Fair Exchange with a Semi-Trusted Third Party",
  pages     = "1--5",
  booktitle = "4th {ACM} Conference on Computer and Communications Security",
  address   = "Z{\"u}rich, Switzerland",
  year      = "1997",
  publisher = pub-ACM,
  month     = apr,
  editor    = "Tsutomu Matsumoto",
  annote    = "active exchange?
[to get]"
}

% Diploma thesis. Umlauts are written as explicit TeX accents so that the
% entry does not depend on the babel/german double-quote shorthand.
@Misc{Gaertner:1997:FRD,
  author       = {Felix G{\"a}rtner},
  title        = "Fehlertolerante {Replikation} von {Diensten} mit schwacher {Konsistenz} mittels selbststabilisierender verteilter {Algorithmen}",
  howpublished = {Diplomarbeit DA-BS-1997-06 am Fachgebiet Betriebssysteme des Fachbereichs Informatik, Technische Universit{\"a}t Darmstadt},
  year         = "1997",
  month        = dec,
  note         = {Internet: \texttt{http://www.informatik.tu-darmstadt.de/\-$\tilde{}$felix/diploma}}
}

% Keynote at AADEBUG 97; OPTnote is deactivated (ignored by styles).
@InProceedings{Garg:1997:OCD,
  author    = "Vijay K. Garg",
  title     = "Observation and control for debugging distributed computations",
  pages     = "1--12",
  booktitle = "3rd Int. Workshop on Automated Debugging (AADEBUG 97)",
  year      = "1997",
  address   = "Link{\"o}ping, Sweden",
  url       = "\url{http://www.ep.liu.se/ea/cis/1997/009/}",
  month     = may,
  OPTnote   = "keynote presentation",
  annote    = "As one of the ``big men'' in theory of distributed systems,
               Garg presents here an overview over the topics of observation
               and control of distributed computations. Objective of control
               is to either maintain an invariant on a global state or to
               ensure a proper order of events. Observation is used to
               monitor system actions. Three restrictions impose problems on
               observation: (1) the lack of shared clock can be alleviated by
               substituting causality for real time and detecting predicates
               transformed using `possibly' and `definitely'. Possibly true
               predicates are useful for detecting bad conditions, whereas
               definitely true predicates are useful to verify the occurrence
               of good predicates. (2) The lack of shared memory can be
               alleviated by using the notion of monotonicity. A predicate is
               monotone with respect to a variable if monotonic changing of
               that variable doesn't change the truth of the predicate.
This allows us to restrict our attention to state intervals rather than
               states. This allows us to reduce the number of events that
               must be inspected drastically. (3) Combinatorial explosion is
               combatted by the use of linear predicates. In general,
               detecting possibly is NP-complete. However, linear predicates
               can be detected efficiently: a predicate is linear if its
               value `false' can be detected ``locally'' (i.e., it contains a
               forbidden state of a process or channel). So conjunctions of
               local predicates can be efficiently detected. The paper
               briefly surveys some possibly detection algorithms and states
               some open problems. Then it turns to the issue of control and
               discusses different modes (on-line, off-line) and methods
               (delaying/reordering events). Finally, a fictitious (but
               implementable) distributed debugger is described. Overall,
               this is a very fluent introductory paper to the issues of
               observation (and control) in distributed systems."
}

% FTDCS-6 position paper; OPTpages is a deactivated placeholder because the
% page range has not been recorded.
@InProceedings{Guerraoui:1997:CBM,
  author    = "Rachid Guerraoui and Andr{\'e} Schiper",
  title     = "Consensus: the big misunderstanding",
  OPTpages  = "???",
  booktitle = "Proceedings of the 6th Workshop on Future Trends of Distributed Computing Systems (FTDCS-6)",
  year      = "1997",
  month     = oct,
  annote    = "This paper tries to clarify six popular misunderstandings
               about the consensus problem that prevent consensus as being
               considered fundamental both in theory and in practice. The
               misunderstandings are: (1) Consensus is for theoreticians
               only, (2) Time-outs are enough, (3) There is no life after
               FLP, (4) The failure detector model is unrealistic, (5)
               Time-free means inefficient, (6) Asynchronous algorithms
               cannot be used for time critical applications. A very concise
               and well readable paper that does good summarizing work and is
               a good source for arguments."
}

% WDAG'97 paper (LNCS 1320) on atomic multicast with a minimality property.
@InProceedings{Guerraoui:1997:GAM,
  author    = "Rachid Guerraoui and {Andr\'e} Schiper",
  title     = "Genuine atomic multicast",
  number    = "1320",
  series    = ser-LNCS,
  pages     = "141--154",
  booktitle = pro-WDAG97,
  year      = "1997",
  publisher = pub-SV,
  month     = sep,
  annote    = "The authors define genuine atomic multicast to be an atomic
               multicast with a specific minimality property, i.e. that only
               the processes in the multicast group ``act'' and others remain
               ``quiet'' (this is as opposed to atomic multicast faked by an
               underlying atomic broadcast). They show that genuine atomic
               multicast is strictly stronger than atomic broadcast in that
               it needs a perfect failure detector (thus stricter synchrony
               assumptions) to be solvable in asynchronous systems. They
               argue, that it is exactly the minimality requirement that
               makes the problem unsolvable with unreliable failure
               detection."
}

@Article{Guerraoui:1997:SBR,
  author    = "Rachid Guerraoui and Andr{\'e} Schiper",
  title     = "Software-Based Replication for Fault Tolerance",
  journal   = j-IEEE-COMPUTER,
  volume    = "30",
  number    = "4",
  pages     = "68--74",
  month     = apr,
  year      = "1997",
  keywords  = "correctness criterion; cost; fault tolerance; group
               communication; linearizability; message passing; off-the-shelf
               hardware; replicated servers; replicated service
               implementation techniques; reviews; software fault tolerance;
               software-based replication; specialized hardware; survey",
  treatment = "G General Review",
  annote    = "This is a general survey over software based replication
               techniques to achieve fault tolerance with a strong emphasis
               on the relations to group communication and consensus. Issues
               of view-synchronous and totally ordered communication and
               their relation to consensus using unreliable failure detectors
               are discussed. In general, this is a lightweight overview
               paper that doesn't upset your tummy."
}

@Article{Hsueh:1997:FIT,
  author  = "Mei-Chen Hsueh and Timothy K. Tsai and Ravishankar K. Iyer",
  title   = "Fault Injection Techniques and Tools",
  journal = j-IEEE-COMPUTER,
  volume  = "30",
  number  = "4",
  pages   = "75--82",
  month   = apr,
  year    = "1997",
  annote  = "A survey over current fault injection techniques and tools. A
             good quote. Interesting are the different types of software
             fault injection techniques and their relations to the program
             transformational approach in describing failure models
             \cite{Gaertner:1998:SFT}. They have the same underlying idea but
             a different purpose: one is experimental (and dynamic) and the
             other is theoretical (and static). A German reference is
             \cite{Echtle:1998:FMB}."
}

@TechReport{Hurfin:1997:CAS,
  author      = "Michel Hurfin and Achour Most{\'e}faoui and Michel Raynal",
  title       = "Consensus in asynchronous systems where processes can crash and recover",
  institution = "Institut de Recherche en Informatique et Syst\`emes Al\'eatoires (IRISA)",
  year        = 1997,
  number      = 1144,
  address     = "Campus de Beaulieu, 35042 Rennes Cedex, France",
  month       = nov,
  annote      = "[to read] surveyed in \cite{Aguilera:1998:FDC}. Published at
                 SRDS'98 \cite{Hurfin:1998:CAS}."
}

% Page ranges use the double hyphen so that TeX typesets an en dash.
@InProceedings{Kakugawa:1997:DSD,
  author    = "Hirotsugu Kakugawa and Masaaki Mizuno and Mikhail Nesterenko",
  title     = "Development of self-stabilizing distributed algorithms using transformation: case studies",
  pages     = "16--30",
  booktitle = pro-wss97,
  year      = 1997,
  annote    = "The authors evaluate their transformation algorithm
               \cite{Mizuno:1996:TBT} from the serial model to the
               distributed model on several examples including lock-based
               mutual exclusion and leader election. They conclude that
               transformed algorithms have a larger message complexity (which
               depends on the choice of timeout values) but this is paid off
               by sparing the hassle of developing, debugging and verifying
               algorithms for the distributed model from scratch.
Simulation results suggest, that both types of algorithms have the same
               asymptotic message complexity."
}

% Cornell technical report; the annote reproduces the report's abstract.
@TechReport{Kreitz:1997:FRC,
  author      = "Christoph Kreitz",
  title       = "Formal reasoning about communication systems {I}: {Embedding} {ML} into type theory",
  institution = "Cornell University",
  year        = "1997",
  number      = "TR97-1637",
  address     = "Ithaca",
  month       = jul,
  annote      = "Abstract: We present a semantically correct embedding of a
                 subset of the Ocaml programming language into the type
                 theory of NuPRL. The subset is that needed to build the
                 Ensemble group communication system. We describe the
                 essential methodologies for representing language constructs
                 by type-theoretical expressions. Tactics representing
                 derived inference rules and a programming logic for these
                 constructs will be discussed as well as algorithms for
                 translating an Ocaml-program into NuPRL-objects and vice
                 versa. The formal representations and the translation
                 algorithms will serve as the foundation for the development
                 of automated reasoning tools for the verification and
                 optimization of a group communication systems. [(noch) nicht
                 ausgedruckt]"
}

@Article{Kuhn:1997:SFP,
  author  = "D. Richard Kuhn",
  title   = "Sources of Failure in the Public Switched Telephone Network",
  journal = j-IEEE-COMPUTER,
  volume  = "30",
  number  = "4",
  pages   = "31--36",
  month   = apr,
  year    = "1997",
  annote  = "[to read]"
}

@InProceedings{Kulkarni:1997:CDM,
  author    = "Sandeep S.
Kulkarni and Anish Arora",
  title     = "Compositional design of multitolerant repetitive {Byzantine} agreement",
  booktitle = "Proceedings of the 17th International Conference on the Foundations of Software Technology and Theoretical Computer Science, Kharagpur, India",
  year      = "1997",
  annote    = "Byzantine agreement is taken as an application example of
               building fault tolerant programs using the detectors and
               correctors methodology of Arora and Kulkarni
               \cite{Arora:1998:CDM}."
}

@InProceedings{Malkhi:1997:UID,
  author    = "Dahlia Malkhi and Michael Reiter",
  title     = "Unreliable Intrusion Detection in Distributed Computations",
  pages     = "116--124",
  booktitle = "Proceedings of the 10th Computer Security Foundations Workshop (CSFW97)",
  year      = "1997",
  address   = "Rockport, MA",
  month     = jun,
  annote    = "[to read]"
}

% The lowercase "van" marks the name particle, so BibTeX parses the surname
% as "van Oorschot"; publisher name and city are kept in separate fields.
@Book{Menezes:1997:HAC,
  author    = "Alfred J. Menezes and Paul C. van Oorschot and Scott A. Vanstone",
  title     = "Handbook of Applied Cryptography",
  publisher = "CRC Press",
  address   = "Boca Raton, FL",
  year      = "1997",
  annote    = "Brilliant and beautiful book on all aspects of cryptography
               with a strong practical perspective without diving into source
               code (like Schneier)."
}

@Article{Nelles:1997:NNI,
  author  = {O. Nelles and S. Ernst and R.
Isermann},
  title   = {{Neuronale Netze zur Identifikation nichtlinearer dynamischer Systeme: ein \"Uberblick}},
  journal = {Automatisierungstechnik},
  year    = {1997},
  volume  = {45},
  number  = {6},
  pages   = {251--262},
  annote  = {[Angabe von Armin]}
}

@TechReport{Oliveira:1997:CCR,
  author      = "Rui Oliveira and Rachid Guerraoui and Andr{\'e} Schiper",
  title       = "Consensus in the crash-recover model",
  institution = "EPFL -- D{\'e}partement d'Informatique, Lausanne, Switzerland",
  year        = "1997",
  number      = "TR-97/239",
  month       = aug,
  annote      = "[to read] surveyed in \cite{Aguilera:1998:FDC}."
}

% FC '97 paper (LNCS 1318); the month field carries the conference dates.
@InProceedings{Pagnia:1997:TMP,
  title      = "Towards Multiple-Payment Schemes for Digital Money",
  author     = "Henning Pagnia and Ralph Jansen",
  pages      = "203--215",
  booktitle  = "Financial Cryptography: First International Conference, {FC}~'97",
  editor     = "Rafael Hirschfeld",
  series     = ser-LNCS,
  volume     = "1318",
  year       = "1997",
  month      = "24--28~" # feb,
  address    = "Anguilla, British West Indies",
  publisher  = pub-SV,
  ISBN       = "3-540-63594-7",
  references = "{CRYPTO::Brands1993} {CRYPTO::chaumFN1988}
                {EUROCRYPT::ChaumP1992} {CRYPTO::Ferguson1993}
                {EUROCRYPT::Jakobsson1995}",
  annote     = "[to read] reinvented in \cite{Riordan:1998:CEP}."
}

@InProceedings{Prisco:1997:RPA,
  author    = "Roberto De Prisco and Butler Lampson and Nancy Lynch",
  title     = "Revisiting the Paxos Algorithm",
  booktitle = pro-wdag97,
  pages     = "111--125",
  year      = "1997",
  annote    = "[to read]"
}

@Misc{Rock:1997:TSC,
  author       = {Georg Rock and Werner Stephan and Andreas Wolpers},
  title        = {Tool support for the compositional development of distributed systems},
  howpublished = {\url{http://www.dfki.uni-sb.de/vse/papers/rsw97.ps.gz}},
  month        = may,
  year         = {1997},
  annote       = {[to read] published elsewhere?}
}

% DCCA-6 paper on verifying time-triggered versions of round-based algorithms.
@InProceedings{Rushby:1997:SFV,
  author    = {John Rushby},
  title     = {Systematic Formal Verification for Fault-Tolerant Time-Triggered Algorithms},
  booktitle = {Dependable Computing for Critical Applications---6},
  series    = {Dependable Computing and Fault Tolerant Systems},
  volume    = 11,
  year      = 1997,
  editor    = {Mario Dal Cin and Catherine Meadows and William H. Sanders},
  publisher = {IEEE Computer Society},
  address   = {Garmisch-Partenkirchen, Germany},
  month     = mar,
  pages     = {203--222},
  annote    = "Rushby argues for the separation of algorithm functionality
               and timeliness properties. Proofs for time-critical modules
               can be quite cumbersome if they are tried as is, but they can
               become much simpler if the abstract functionality is proven
               correct and they are then embedded into a real-time
               environment in a safe way by a once-and-for-all proven
               methodology (an idea also proposed by Le Lann
               \cite{LeLann:1995:ORN}). Rushby presents such a transformation
               for (synchronous) round based algorithms: such an algorithm
               can be mechanically transformed into a time-triggered
               implementation with tight real-time bounds mechanically. The
               case is made by transforming the famous oral message BGP
               protocol \cite{Lamport:1982:BGP} into a time-triggered version
               by hand and using the PVS automated proof system."
}

% Journal paper on the early consensus algorithm; an erratum is cited at the
% end of the annote.
@Article{Schiper:1997:ECA,
  author  = "Andr{\'e} Schiper",
  title   = "Early consensus in an asynchronous system with a weak failure detector",
  journal = j-DC,
  year    = "1997",
  volume  = "10",
  number  = "3",
  pages   = "149--157",
  annote  = "The author presents a new algorithm for consensus in
             asynchronous systems which is an improvement over the original
             algorithm by Chandra and Toueg \cite{Chandra:1996:UFD}. Both use
             an unreliable eventually strong failure detector. The new early
             consensus algorithm uses the rotating coordinator paradigm and
             proceeds in asynchronous rounds. At the beginning of a round,
             the coordinator sends its estimate to all and tries to impose
             this value on the rest. A process receiving this estimate
             reissues it to all. As soon as a process receives this estimate
             from a majority of processes, it decides on that estimate. The
             algorithm ensures that once a majority of processes have adopted
             the same estimate, this value is locked and doesn't change
             anymore. So once a process decides, all other processes that
             decide do not decide differently. The failure detector ensures
             the liveness of the protocol. In comparison to the original
             Chandra/Toueg algorithm (CT) early consensus uses $n(n-1)$
             messages to reach a decision in point-to-point networks while CT
             uses $3(n-1)$ messages. However, the decision value must be sent
             to all (to cater for failure cases), and so both algorithms need
             an additional $n(n-1)$ messages for the total execution. Both
             therefore have $O(n^2)$ message complexity. However, early
             consensus has a lower latency degree. The latency degree is
             defined to be the largest timestamp of logical time, where
             ``messages tick''. This is a more precise measure for the number
             of rounds that an algorithm needs to execute. Early consensus
             has a latency degree of 2, whereas CT has a latency degree of 4
             (easily optimized to 3).
The efficiency stems from improving parallelism by adding messages in the second part of a round. So early consensus is both an improvement in simplicity as it is in efficiency. See also Erratum \cite{Schiper:1997:EEC}."
}

@Article{Schiper:1997:EEC,
  author = "{Andr\'e} Schiper",
  title = "Erratum: Early consensus in an asynchronous system with a weak failure detector",
  journal = j-DC,
  year = "1997",
  volume = "10",
  pages = "198",
  annote = "corrections of lines 34 and 46 in Figure 1 of page 153."
}

% The next two entries are the conference and technical-report versions of
% the same work; their annote fields cite each other.
@InProceedings{Setz:1997:DIA,
  author = {Thomas Setz},
  title = {Design, implementation and performance of a fault tolerant tuple space machine},
  booktitle = {Proceedings of the International Conference on Parallel and Distributed Systems (ICPADS'97)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {10--13},
  year = {1997},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {Seoul, Korea},
  month = dec,
  OPTorganization = {},
  publisher = pub-IEEE-CSP,
  OPTnote = {},
  annote = {Conference version of \cite{Setz:1997:DIP}.}
}

@TechReport{Setz:1997:DIP,
  author = {Thomas Setz},
  title = {Design, Implementation and Performance of a Mutex-Token based Fault-Tolerant Tuple Space Machine},
  institution = {Sonderforschungsbereich 124, Universit{\"a}t des Saarlandes},
  year = {1997},
  OPTkey = {},
  OPTtype = {},
  number = {SFB 124 - 09/1997, TP D5},
  address = {Fachbereich Informatik, 66041 {Saarbr\"ucken}, Germany},
  month = jul,
  OPTnote = {},
  url = "\url{http://cdc-server.cdc.informatik.tu-darmstadt.de/home/LiPS/LiPS/documentation/objects/doc/html/papers/FTTM/FTTM_technical_report_sb/FTTM_technical_report_sb.html}",
  annote = {Introduction to LiPS and description of the membership protocol used to make the tuple space engine fault tolerant. Appeared at ICPADS'97 \cite{Setz:1997:DIA}.}
}

@InProceedings{Sims:1997:RMS,
  author = "J. T.
Sims",
  title = "Redundancy Management Software Services for Seawolf Ship Control System",
  pages = "390--394",
  booktitle = pro-ftcs97,
  ISBN = "0-8186-7831-3",
  month = jun,
  publisher = "IEEE",
  address = "Washington - Brussels - Tokyo",
  year = "1997",
  annote = "Seawolf is a ``new'' class of US Navy attack submarines. Its computer system is quadruply redundant with four independent fault containment regions which use Byzantine tolerant voting to achieve consensus on output. The voting process is implemented in simple hardware and also is quadruply redundant. The system is masking tolerant against up to two non-simultaneous permanent faults before it is fail-safed. Faulty components can be exchanged online. The processors operate in lock-step synched mode. Fault detection and isolation methods and reconfiguration facilities are also described."
}

@InProceedings{Stoller:1997:DGP,
  author = {Scott D. Stoller},
  title = {Detecting Global Predicates in Distributed Systems with Clocks},
  booktitle = pro-wdag97,
  OPTcrossref = {},
  OPTkey = {},
  OPTeditor = {Marios Mavronicolas and Philippas Tsigas},
  OPTvolume = {},
  OPTnumber = {1320},
  OPTseries = ser-LNCS,
  year = {1997},
  OPTorganization = {},
  OPTpublisher = pub-SV,
  OPTaddress = {},
  month = sep,
  pages = {185--199},
  OPTnote = {},
  annote = "Stoller proposes a generalization of predicate detection in distributed computations based on lattice theory: he shows that any partial order with certain properties can be used to reason about consistent global states. From such an ordering follow generic definitions of the modalities `possibly' and `definitely' introduced by Cooper and Marzullo \cite{Cooper:1991:CDG}. The author instantiates his generic definitions with two orders which are based on the values of synchronized clocks. The first is called `definitely occurred before' and the second `possibly occurred before'; he also presents adaptations of known algorithms to detect them.
Such algorithms can be optimized if the predicate has a certain (conjunctive) form (analogous to local detectability in constraint satisfaction \cite{Arora:1996:CSB}). A combination of possibly and definitely called `instantaneously' (or `properly') is introduced and discussed. Application of the results is seen in online monitoring and debugging of distributed applications, not in fault tolerance, although the example of debugging database coherence protocols is near to detecting illegal states."
}

@TechReport{Weber:1997:DAW,
  author = "Michael Weber and Rolf Walter and Hagen {V\"olzer} and Tobias Vesper and Wolfgang Reisig and Sibylle Peuker and Ekkart Kindler and {J\"orn} Freiheit and {J\"org} Desel",
  title = "{DAWN}: {Petrinetzmodelle} {zur} {Verifikation} {Verteilter} {Algorithmen}",
  institution = "Humboldt-{Universit\"at} Berlin, Institut {f\"ur} Informatik",
  year = "1997",
  OPTcrossref = "",
  OPTkey = "",
  type = "Informatik-Bericht",
  number = "88",
  address = "Unter den Linden 6, D-10099 Berlin",
  month = dec,
  OPTnote = "",
  OPTannote = "[to read]"
}

@Misc{Wilhelm:1997:CPO,
  author = "Uwe G. Wilhelm",
  title = "Cryptographically Protected Objects",
  month = may,
  year = 1997,
  note = "A French version appeared in the Proceedings of RenPar'9, Lausanne, CH. {{\tt http://lsewww.epfl.ch/\~{}wilhelm/CryPO.html}}",
  annote = "presents the idea of a tamper proof computing environment."
}

% Technical-report version; the DISC'98 paper is Aguilera:1998:FDC.
@TechReport{Aguilera:1998:FDCTR,
  title = "Failure Detection and Consensus in the Crash-Recovery Model",
  author = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  number = "TR98-1676",
  year = "1998",
  institution = "Cornell University, Computer Science Department",
  abstract = "We study the problems of failure detection and consensus in asynchronous systems in which processes may crash and recover, and links may lose messages. We first propose new failure detectors that are particularly suitable to the crash-recovery model.
We next determine under what conditions stable storage is necessary to solve consensus in this model. Using the new failure detectors, we give two consensus algorithms that match these conditions: one requires stable storage and the other does not. Both algorithms tolerate link failures and are particularly efficient in the runs that are most likely in practice --- those with no failures or failure detector mistakes. In such runs, consensus is achieved within 3d time and with 4n messages, where d is the maximum message delay and n is the number of processes in the system.",
  month = apr,
  annote = "The authors extend the work on asynchronous consensus using unreliable failure detectors to a more severe fault model than previous research has considered: now nodes may crash and recover, and links may lose messages. The authors first derive specifications for failure detectors which are better suited for this new fault model than those proposed in earlier papers by other authors. They do this by showing that the usual strong completeness property for the crash-recovery model (stating that eventually every bad process is permanently suspected by all good processes) is too strong because these detectors have to make predictions on the future behavior of other processes. They propose a new form of failure detectors with an infinite output domain and with different properties that circumvents the problems of the previous specification. Next, the authors identify under what conditions stable storage is necessary to solve consensus in such an environment. They show that as long as the number of always-up processes is less than or equal to the number of eventually-down processes consensus cannot be reached even if links do not lose messages and an eventually perfect failure detector can be used. Saving the proposed/decision values on stable storage does not help if there are additionally more than two eventually-down processes.
However, if there are more always-up processes than bad processes consensus can be solved even without stable storage (two increasingly efficient algorithms are given). With stable storage consensus is solvable if there is a majority of good processes in the system (an algorithm is given). So, as long as one can guarantee that more processes never crash than those processes that are unstable or will eventually remain down, stable storage is not needed. If all processes may crash at least once, stable storage and a majority of good processes are needed to solve consensus. All results hold for fair lossy channels."
}

% Conference version of Aguilera:1998:FDCTR.
@InProceedings{Aguilera:1998:FDC,
  author = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title = {Failure Detection and Consensus in the Crash-Recovery Model},
  booktitle = {Proceedings of the 12th International Symposium on Distributed Computing (DISC)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {231--245},
  year = {1998},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = ser-LNCS,
  OPTaddress = {},
  month = sep,
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {Described in \cite{Aguilera:1998:FDCTR}, this is a more citable reference.
Published in DC in 2000 \cite{Aguilera:2000:FDC}.}
}

@Article{Akguel:1998:ICS,
  author = {Tayfun {Akg\"ul}},
  title = {International Conference on Self-Similar Systems (Cartoon)},
  journal = {IEEE -- The Institute},
  year = {1998},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  pages = 10,
  month = sep,
  OPTnote = {},
  annote = {Shows some linguistic resemblance to WSS.}
}

@Article{Akguel:1998:TZT,
  author = {Tayfun {Akg\"ul}},
  title = {Teaching the Z-Transform (Cartoon)},
  journal = {IEEE -- The Institute},
  year = {1998},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  pages = {12},
  month = nov,
  OPTnote = {},
  annote = {Shows a professor talking to a ZZZZ-sleeping audience.}
}

% Author name written exactly as in Almeida:1998:QSA so both entries
% agree on the spelling of the same person.
@InProceedings{Almeida:1998:ULG,
  author = {Carlos Almeida and Paulo {Ver{\'\i}ssimo}},
  title = {Using light-weight groups to handle timing failures in {\em quasi-synchronous} systems},
  booktitle = {Proceedings of the 19th IEEE Real-Time Systems Symposium},
  year = 1998,
  address = {Madrid, Spain},
  month = dec,
  annote = "covers part of the work described in \cite{Almeida:1998:QSA}."
}

@TechReport{Almeida:1998:QSA,
  author = "Carlos Almeida and Paulo {Ver{\'\i}ssimo} and {Ant\'{o}nio} Casimiro",
  title = "The quasi-synchronous approach to fault-tolerant and real-time communication and processing",
  institution = {Instituto Superior T\'{e}cnico},
  year = 1998,
  number = {CTI RT-98-04},
  address = {Lisboa, Portugal},
  month = jul,
  annote = "The authors propose a new system model to use for large-scale fault-tolerant distributed systems, the quasi-synchronous approach. The authors augment the asynchronous model by adding a timing failure detector to the system. A timing failure detector can perfectly detect the non-timeliness of certain events within a fixed period of time. With such a failure detector it is possible to build reliable systems in asynchronous environments because it is essentially a perfect failure detector as described by Chandra and Toueg \cite{Chandra:1996:UFD}.
The authors argue that such a failure detector can be implemented over modern ``synchronous'' network communications like ATM or GSM. Thus, only part of the system (control channels vs. payload channels) need be synchronous, easing the burden of practical implementations. The authors give excellent reviews of the current work in this area and show several ways how the timeliness properties of quasi-synchronous applications can be increased: (1) by an early delivery causal atomic broadcast, (2) by dynamically adjusting the QoS (and thus timeliness deadlines) and (3) by active replication to limit response times of servers. Overall, this is an excellent paper which is also suited as an introduction to the area (after reading \cite{Chandra:1996:UFD}). Previous ideas appeared in other form in \cite{Almeida:1996:TFD,Verissimo:1995:QSS}."
}

@InProceedings{Arndt:1998:DLD,
  author = {Olaf Arndt and Bernd Freisleben and Thilo Kielmann and Frank Thilo},
  title = {Dynamic load distribution with the {WINNER} system},
  booktitle = {Proceedings of the Workshop ``Anwendungsbezogene Lastverteilung'' (ALV'98)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {77--88},
  year = {1998},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {M\"unchen, Germany},
  OPTmonth = {},
  organization = {Technische Universit\"at M\"unchen},
  OPTpublisher = {},
  OPTnote = {},
  annote = {}
}

@Article{Arora:1998:CDM,
  author = "Anish Arora and Sandeep S. Kulkarni",
  title = "Component based design of multitolerant systems",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year = "1998",
  volume = "24",
  number = "1",
  pages = "63--78",
  month = jan,
  OPTnote = "",
  annote = "Refinement of Arora's theory of closure and convergence \cite{Arora:1993:CCF}: the ability to tolerate certain kinds of faults is added to a system in a stepwise manner by adding detectors, that can detect invalidation of safety, and correctors, that re-establish liveness.
By adding these components, care must be taken, that they do not interfere with each other. The application example developed in the paper is a multitolerant token ring protocol. The model used is the serial model. The difficulties of extending it to message passing models are not discussed."
}

@InProceedings{Arora:1998:DCT,
  author = "Anish Arora and Sandeep S. Kulkarni",
  title = "Detectors and Correctors: A theory of fault-tolerance components",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  OPTpages = "",
  booktitle = pro-icdcs98,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  month = may,
  OPTnote = "",
  annote = "A compact presentation and discussion of \cite{Arora:1998:CDM}."
}

@Article{Arora:1998:DMF,
  author = "Anish Arora and Sandeep S. Kulkarni",
  title = "Designing masking fault tolerance via nonmasking fault tolerance",
  OPTcrossref = "",
  OPTkey = "",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year = "1998",
  volume = "24",
  number = "6",
  OPTpages = "",
  month = jun,
  annote = "A paper in the line of Arora's theory of correctors and detectors \cite{Arora:1998:CDM}. A fault intolerant program is transformed into a non-masking fault tolerant program by adding correctors and then transformed into a masking fault tolerant program by adding detectors. Detectors inhibit normal program actions when invalidation of the safety predicate is observed. Thus the program only takes ``safe'' steps. Application examples include Byzantine agreement, reliable data transfer, mutual exclusion."
}

@InProceedings{Arora:1998:SFC,
  author = "Anish Arora and Paul C. Attie and E.
Allen Emerson",
  title = "Synthesis of fault-tolerant concurrent programs",
  pages = "173--182",
  booktitle = pro-podc98,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  OPTmonth = "",
  OPTnote = "",
  annote = "Based on a synthesis method for concurrent programs by Emerson and Clarke, this paper extends the possibilities to synthesize fault-tolerant programs that can tolerate a certain fault class. Faults are modelled as state transitions with a possibly extended state space, and recovery transitions are used to tolerate these faults. The method is based on temporal logic specifications. As examples, solutions to mutual exclusion and barrier synchronization are synthesized."
}

@InProceedings{Asokan:1998:APO,
  author = {N. Asokan and Victor Shoup and Michael Waidner},
  title = {Asynchronous protocols for optimistic fair exchange},
  booktitle = {Proceedings of the IEEE Symposium on Research in Security and Privacy},
  OPTcrossref = {},
  OPTkey = {},
  pages = {86--99},
  year = {1998},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  url = "http://www.zurich.ibm.com/Technology/Security/publications/1998/ASW98.ps.gz",
  month = may,
  OPTorganization = {},
  OPTpublisher = {},
  note = {Printed version contains some errors. Errata sheet is distributed together with the electronic version.},
  annote = {[to read]}
}

@InProceedings{Asokan:1998:OFE,
  author = "N.
Asokan and Victor Shoup and Michael Waidner",
  title = "Optimistic Fair Exchange of Digital Signatures",
  pages = "591--606",
  note = "A longer version is available as Technical Report RZ 2973 (\#93019), IBM Research, November 1997 at http://www.zurich.ibm.com/Technology/Security/publications/1997/ASW97b.ps.gz",
  booktitle = "EuroCrypt 98",
  year = "1998",
  publisher = pub-SV,
  editor = "Kaisa Nyberg",
  series = ser-LNCS,
  annote = "[to read]"
}

% The acronym VSE is brace-protected so that sentence-casing styles
% do not turn it into "Vse".
@Misc{Autexier:1998:VSE,
  OPTkey = {},
  author = {Serge Autexier and Dieter Hutter and Bruno Langenstein and Heiko Mantel and Georg Rock and Axel Schairer and Werner Stephan and Roland Vogt and Andreas Wolpers},
  title = {{VSE}: {Formal} methods meet industrial needs},
  howpublished = {\url{http://www.dfki.uni-sb.de/vse/papers/ahlm98.ps.gz}},
  OPTmonth = {},
  year = {1998},
  OPTnote = {},
  annote = {[to read] announced to appear in Software Tools for Technology Transfer, 1998, Springer, Special issue on mechanized theorem proving for technology. Contains case study on ROBERTINO robot control system.}
}

@InProceedings{Beauquier:1998:TFD,
  author = {Joffroy Beauquier and Sylvie {Dela\"et} and Shlomi Dolev and {S\'ebastien} Tixeuil},
  title = {Transient fault detectors},
  booktitle = {Proceedings of the 12th International Symposium on DIStributed Computing (DISC'98)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {62--74},
  year = {1998},
  OPTeditor = {},
  OPTvolume = {},
  number = {1499},
  series = ser-LNCS,
  address = {Andros, Greece},
  month = sep,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {[to read]}
}

@Article{Belli:1998:MHS,
  author = {Fevzi Belli},
  title = {{Methoden und Hilfsmittel f\"ur die systematische Pr\"ufung komplexer Software}},
  journal = j-IS,
  year = {1998},
  OPTkey = {},
  volume = {21},
  number = {6},
  pages = {337--346},
  month = dec,
  OPTnote = {},
  annote = {Vorstellung von konventionellen Testmethoden und Testwerkzeugen, Reviews etc.}
}

@MastersThesis{Bendrath:1998:CNR,
  author = {Ralf Bendrath},
  title = {{Computer und die neue
Rolle des Milit\"ars in den USA}},
  school = {Freie Universit\"at Berlin, Fachbereich Politische Wissenschaft},
  year = {1998},
  OPTkey = {},
  type = {Diploma thesis (in German)},
  OPTaddress = {},
  month = aug,
  OPTnote = {},
  annote = {Eine sehr detaillierte und quellenreiche Arbeit ueber den Einfluss von Computern auf das Verhaeltnis zwischen Militaer und der Zivilgesellschaft. Eingegangen wird auf die neue Rolle des Soldaten im Krieg (Vernetzung, Integration von Strategie und Taktik), Automatisierung der Verarbeitung von militaerischen Daten (KI als Schluesseltechnologie und deren Gefahren), der Begriff des Information Warfare (Ausweitung computermilitaerischer Operationen auf den zivilen Bereich, zunehmende Ununterscheidbarkeit von militaerischen und zivilen Operationen durch Praevention), Probleme des Sicherheitsbegriffs.}
}

% Initials are separated by a space so BibTeX parses them as two name tokens.
@InProceedings{Chandra:1998:HFF,
  author = "S. Chandra and P. M. Chen",
  title = "How fail-stop are faulty programs?",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  pages = "240--249",
  booktitle = pro-ftcs98,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  month = jun,
  OPTnote = "",
  annote = "see proceedings"
}

@Article{Chase:1998:DGP,
  author = {Craig M. Chase and Vijay K. Garg},
  title = {Detection of global predicates: Techniques and their limitations},
  journal = j-DC,
  year = {1998},
  OPTkey = {},
  volume = {11},
  number = {4},
  pages = {191--201},
  OPTmonth = {},
  OPTnote = {},
  abstract = {We show that the problem of predicate detection in distributed systems is NP-complete. In the past, efficient algorithms have been developed for special classes of predicates such as stable predicates, observer independent predicates, and conjunctive predicates. We introduce a class of predicates, semi-linear predicates, which properly contains all of the above classes. We first discuss stable, observer independent and semi-linear classes of predicates and their relationships with each other.
We also study closure properties of these classes with respect to conjunction and disjunction. Finally, we discuss algorithms for detection of predicates in these classes. We provide a non-deterministic detection algorithm for each class of predicate. We show that each class can be equivalently characterized by the degree of non-determinism present in the algorithm. Stable predicates are defined as those that can be detected by an algorithm with the most non-determinism. All other classes can be derived by appropriately constraining the non-determinism in this algorithm.},
  annote = {[to read]}
}

% Second author spelled "Christof", matching the author field of Fetzer:1998:MCM.
@InProceedings{Cristian:1998:TAS,
  author = "Flaviu Cristian and Christof Fetzer",
  title = "The timed asynchronous distributed system model",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  pages = "140--149",
  booktitle = pro-ftcs98,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  month = jun,
  OPTnote = "",
  annote = "The authors present a formal definition of a system model that is claimed to capture the current behavior of distributed systems like the Internet. The model makes the following assumptions: (1) processes have hardware clocks that have bounded drift rate, (2) processes communicate via unreliable datagram service with broadcast facility that has omission/performance failure semantics, (3) processes have crash/performance failure semantics, (4) there is no bound on load or failure rate, (5) services are usually timed, i.e., their specification prescribes a time interval within which some transitions will occur. Together with the optional extensions of stable storage and progress assumptions the authors claim that this model adequately reflects today's ``reality'', since important problems (like consensus etc.) are solvable in the Internet. The model also caters for network partitions (they are modeled by sufficiently many crash/omission failures).
The notion of a bounded drift rate is sufficient to implement a failure detector that detects untimeliness of processing or responses and thus can be used to build fail-aware services \cite{Fetzer:1997:FAA}."
}

@Article{Echtle:1998:FMB,
  author = {Klaus Echtle and {Jo\~ao} Gabriel Silva},
  title = {{Fehlerinjektion -- ein Mittel zur Bewertung der Ma\ss{}nahmen gegen Fehler in komplexen Rechnersystemen}},
  journal = j-IS,
  year = {1998},
  OPTkey = {},
  volume = {21},
  number = {6},
  pages = {328--336},
  month = dec,
  OPTnote = {},
  annote = {Empirische Verl\"a\ss{}lichkeitsbewertung im Gegensatz zu analytischer (vgl. \cite{Thurner:1998:VKS}). Englische Referenz ist \cite{Hsueh:1997:FIT}. Die Autoren beleuchten alle wesentlichen Aspekte moderner Fehlerinjektionstechniken und beschreiben die Zusammenh\"ange zu anderen Gebieten der Informatik. Z.B. die N\"ahe der Fehlerinjektion zum normalen Software-Test und zur formalen Verifikation. Letztere st\"o\ss{}t aber oft an Leistungsgrenzen, w\"ahrend Fehlerinjektion fast immer einen gegebenen Aufwandsrahmen ausf\"ullen kann. Zun\"achst werden Techniken der physikalischen Fehlerinjektion (Einwirkung auf Pins, Bestrahlung durch Schwerionen oder elektromagnetische Strahlung) und software-implementierte Fehlerinjektion besprochen. Letztere unterscheidet sich in Injektion auf der Komponenten-Ebene (direkte Ver\"anderung des Codes, direkte \"Anderung von Variablen, Programmz\"ahler oder Register) und auf der System-Ebene (Abschw\"achung von Annahmen \"uber andere unabh\"angige Prozesse an der Nachrichtenschnittstelle). Die Verl\"a\ss{}lichkeitsbewertung und die Fehlererfassung (engl. coverage) geschieht dann durch Auswahl geeigneter, realistischer Fehlerszenarien und einer ausreichenden Anzahl von Experimenten. Abschlie\ss{}end wird auf den Test von Fehlertoleranzverfahren in verteilten Systemen eingegangen: Der Begriff des Fehlerbereiches wird eingef\"uhrt um die \"ublichen Fehlermodelle (crash, Byzantine, etc.) zu beschreiben.
Fehler k\"onnen dann an der Nachrichtenschnittstelle injiziert werden. Im Gegensatz zur formalen Verifikation bietet diese Technik den Vorteil, da\ss{} das System in einer ``realen'' Umgebung getestet wird. Ein paar g\"angige Fehlerinjektoren werden vorgestellt. Insgesamt ein guter \"Uberblick mit einer Art Markt\"ubersicht \"uber Fehlerinjektoren. Verwiesen wird bei der formalen Verifikation und Fehlermodellierung auf \cite{Echtle:1984:FSV}.}
}

@InProceedings{Gaertner:1998:EFR,
  author = {Felix C. {G\"artner} and Henning Pagnia},
  title = "Enhancing the fault tolerance of replication: another exercise in constrained convergence",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  pages = "29--30",
  booktitle = pro-ftcs98-fastabs,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  month = jun,
  OPTnote = "",
  OPTannote = ""
}

@InProceedings{Fetzer:1998:MCM,
  author = {Christof Fetzer},
  title = {The Message Classification Model},
  booktitle = {Proceedings of the 17th ACM Symposium on Principles of Distributed Computing},
  year = {1998},
  month = jun,
  address = {Puerto Vallarta, Mexico},
  url = {http://www.research.att.com/~christof/MCM},
  abstract = " We propose a new system model for asynchronous distributed systems that we call the message classification model. Motivation for this model is its ability 1) to support a restricted but useful form of ``communication by time'' by classifying messages as either ``slow'' or ``fast'' but without incorporating neither real-time clocks nor ``time-outs'', and 2) to describe transient and permanent network partitions. The message classification model allows the definition of different classes of classification schemes.
To show that the model is indeed useful, we show how one can solve the consensus and the election problem for a certain class of message classification schemes.",
  annote = " Contains a good overview and comparison of different models [to read]"
}

@TechReport{Gaertner:1998:FFT,
  author = "Felix C. {G\"artner}",
  title = "Fundamentals of fault tolerant distributed computing in asynchronous environments",
  institution = "Darmstadt University of Technology",
  year = "1998",
  number = "TUD-BS-1998-02",
  address = "Darmstadt, Germany",
  month = jul,
  url = "http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1998-02.ps",
  note = "To appear in \textit{ACM Computing Surveys}, 31(1), March 1999.",
  annote = "A generalization of Arora and Kulkarni's theory of correction and detection \cite{Arora:1998:CDM} for the asynchronous message passing model. The paper first defines formally important terms like redundancy, fault and fault tolerance. Then it shows that fault tolerance cannot be achieved without redundancy and reveals the two phases necessary in fault tolerance: detection and correction. Detection is generalized to possibility detection in distributed systems and correction is generalized to imposing a predicate on the system. Fundamental methodologies of fault tolerant distributed computing (like fail stop processors, state machine approach, consensus) are shown to fit nicely into the framework."
}

@TechReport{Gaertner:1998:SFT,
  author = "Felix C.
{G\"artner}",
  title = "Specifications for Fault Tolerance: {A} Comedy of Failures",
  institution = "Darmstadt University of Technology",
  year = "1998",
  number = "TUD-BS-1998-03",
  address = "Darmstadt, Germany",
  month = oct,
  url = "http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1998-03.ps.gz",
  annote = "[to write]"
}

@Article{Gamache:1998:WCS,
  author = {Rod Gamache and Rob Short and Mike Massa},
  title = {Windows {NT} clustering service},
  journal = j-IEEE-COMPUTER,
  year = 1998,
  OPTkey = {},
  volume = 31,
  number = 10,
  pages = "55--62",
  month = oct,
  OPTnote = {},
  annote = {A colourful article that praises the clustering service for high availability in NT 5.0. A service may be implemented on a cluster of servers (i.e., a set of identical machines) that all together transparently provide the service as if one single server were present. Hardware and software failures can be detected and failed applications can be restarted on other machines without interrupting the overall mode of operation. Several issues have not been touched yet because of ``technical complexity or schedule pressures'': these are active replication, process pairs, primary-backup, non-stop migration of processes and recovery of shared state between client and server. ``They will be added to future versions of the product.''}
}

@InProceedings{Garg:1998:DPD,
  author = "Vijay K. Garg and J. Roger Mitchell",
  title = "Distributed predicate detection in a faulty environment",
  OPTcrossref = "",
  OPTkey = "",
  OPTeditor = "",
  OPTvolume = "",
  OPTnumber = "",
  OPTseries = "",
  OPTpages = "",
  booktitle = pro-icdcs98,
  year = "1998",
  OPTorganization = "",
  OPTpublisher = "",
  OPTaddress = "",
  OPTmonth = "",
  OPTnote = "",
  annote = "The first real reference on general predicate detection in faulty environments. Several issues in this area are discussed: what type of failure detectors must be used? What if they produce false suspicions? How does this affect the validity of the global predicate?
The authors present an algorithm to reliably detect a subclass of general predicates in an asynchronous message-passing environment subject to process crashes, message loss and channel crashes. The type of predicates they detect are set-decreasing and conjunctive. Set-decreasing means that whenever it holds for a set $S$ of processes, then it also holds for a set $S'\subseteq S$. Conjunctive means that it can be written as the conjunction of local predicates and send-monotonic channel predicates. Send-monotonic channel predicates are those that if it is false, merely sending messages can't make it true. The algorithm is based on one by Hurfin, Mizuno, Raynal and Singhal \cite{Hurfin:1996:ODC} for detecting conjunctions of local predicates. Every process acts as a Monitor process and control messages are piggybacked on application messages. The application must ensure that eventually every process sends a message to every neighbour. Predicate detection is performed by constructing the lattice of consistent global states starting from an ``earliest'' state at every process. Nodes that are suspected to have failed are not inspected for predicate evaluation. This is okay for this special type of predicates. The failure detector used satisfies weak completeness and infinitely often accuracy, meaning that every correct process is never permanently suspected. This is a weaker failure detector than the ``eventually weak'' failure detector of \cite{Chandra:1996:UFD}."
}

@InProceedings{Garg:1998:IFD,
  author = {Vijay K. Garg and J. Roger Mitchell},
  title = {Implementable failure detectors in asynchronous systems},
  booktitle = {Proc. 18th Conference on Foundations of Software Technology and Theoretical Computer Science},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  year = {1998},
  OPTeditor = {V. Arvind and R.
Ramanujam},
  OPTvolume = {},
  number = {1530},
  series = ser-LNCS,
  address = {Chennai, India},
  month = dec,
  OPTorganization = {},
  publisher = pub-SV,
  url = "\url{http://maple.ece.utexas.edu/TechReports/1998/TR-PDS-1998-004.ps.Z}",
  OPTnote = {},
  OPTannote = {}
}

@Book{Gertler:1998:FDD,
  author = {J. Gertler},
  ALTeditor = {},
  title = {Fault Detection and Diagnosis in Engineering Systems},
  publisher = {Marcel Dekker},
  year = {1998},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {New York},
  OPTedition = {},
  OPTmonth = {},
  OPTnote = {},
  annote = {[Angabe von Armin]}
}

@InCollection{Hohl:1998:TLB,
  author = {F. Hohl},
  title = {Time Limited Blackbox Security: Protecting Mobile Agents from Malicious Hosts},
  booktitle = {Mobile Agents and Security},
  crossref = {Vigna:1998:MAS},
  pages = {92--113},
  annote = "referenz von Uwe Wilhelm"
}

% Citation key left unchanged so existing \cite commands keep working;
% only the author's surname spelling in the field is corrected.
@Book{Hoffmann:1998:DMD,
  author = {Paul Hoffman},
  title = {{Der Mann, der die Zahlen liebte}},
  publisher = {Ullstein},
  year = {1998},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth = {},
  OPTnote = {},
  annote = {Biographie von Paul Erd{\H{o}}s.}
}

@InProceedings{Hurfin:1998:CAS,
  author = {Michel Hurfin and A. {Most\'efaoui} and M.
Raynal}, title = {Consensus in asynchronous systems where processes can crash and recover}, booktitle = {Proceedings of the 17th IEEE Symposium on Reliable Distributed Systems (SRDS'98)}, OPTcrossref = {}, OPTkey = {}, pages = {280--286}, year = {1998}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {West Lafayette, Indiana}, month = oct, OPTorganization = {}, publisher = pub-IEEE, OPTnote = {}, annote = {Hinweis aus FG Fehlertolerierende Rechnersysteme Mitteilungen, Maerz 1999. Previously a Technical Report \cite{Hurfin:1997:CAS}.} } @InProceedings{Hutter:1998:VSE, author = {Dieter Hutter and Heiko Mantel and Georg Rock and Werner Stephan and Andreas Wolpers and Michael Balser and Wolfgang Reif and Gerhard Schellhorn and Kurt Stenzel}, title = {{VSE:} {Controlling} the Complexity in Formal Software Developments}, booktitle = {Proceedings of the International Workshop on Applied Formal Methods}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {1998}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Boppard, Germany}, OPTmonth = {}, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, annote = {Overview of VSE-II.} } @Article{Kaiser:1998:EDV, author = {{J\"org} Kaiser and Edgar Nett}, title = {{Echtzeitverhalten in dynamischen, verteilten Systemen}}, journal = j-IS, year = {1998}, OPTkey = {}, volume = {21}, number = {6}, pages = {356--365}, month = dec, OPTnote = {}, annote = {Behandelt den Faktor ``Zeit'' in fehlertoleranten verteilten Systemen. Dabei sind zwei Aspekte von Bedeutung: Kommunikation und Scheduling. Kommunikation in Echtzeitsystemen mu\ss{} (1) Vorhersagbarkeit und (2) Kooperation gew\"ahrleisten. (1) Vorhersagbarkeit bedeutet eine obere Schranke auf Nachrichtenverz\"ogerung und eine Garantie von Eigenschaften unter Spitzenlast. Bei ersterem benutzt man reservierungsbasierte Verfahren (TDMA, braucht globale Zeit) und token-basierte (token-Ring, braucht keine globale Zeit).
Ungeeignet ist Ethernet, obwohl darauf auch andere Verfahren implementiert werden k\"onnen. ATM ist eine Mischform. Kommunikationsfehler entstehen in der Wertedom\"ane und der Zeitdom\"ane. Sie werden in Fehlersemantiken wie omission oder crash beschrieben und m\"ussen toleriert werden. (2) Kooperation bedeutet Ordnung auf Nachrichten und Mitgliedschaft. In Echtzeitsystemen mu\ss{} man irgendwie Zeit mitspezifizieren. Es kann eine globale, synchronisierte Zeit angenommen werden (synchrone, eng synchronisierte Systeme) oder es werden zeitliche Systemannahmen lokal \"uber einen Timeout realisiert (zeitgesteuerte, asynchrone (engl. timed asynchronous), lose synchronisierte Systeme). Im eng synchronisierten Fall gen\"ugt Nachrichtendiffusion bei ausreichender Redundanz. In asynchronen Systemen braucht man eine Best\"atigung. Unter Hinweis auf \cite{Fischer:1985:IDC} wird bemerkt, da\ss{} irgendwelche Zeitannahmen (und seien sie nur unzuverl\"assig \cite{Chandra:1996:UFD}) ben\"otigt werden, um Konsens zu erzielen, auf welche Nachrichten sich noch zu warten lohnt und welche verloren gingen. Diese Zeitannahmen werden als Gleichm\"a\ss{}igkeit (steadyness) und Laufzeitvarianz (tightness) bezeichnet \cite{Verissimo:1993:RTC}. Anschlie\ss{}end wird auf Schedulingverfahren eingegangen. Als Anwendungsbeispiel wird die GMD-Snake Roboterschlange beschrieben.} } @Article{Karat:1998:GRU, author = {Clare-Marie Karat}, title = {Guaranteeing Rights for the User}, journal = j-CACM, year = {1998}, OPTkey = {}, volume = {41}, number = {12}, pages = {29--31}, month = dec, OPTnote = {}, annote = {Contains a ``user's bill of rights'' containing such items as ``the user is always right'' and ``the user has the right to a system that performs exactly as promised''. 
This is meant as a challenge to the computer industry to change its current view and points to the problem that dependency on hard- and software tends to become bigger as maintaining personnel and the industry are able to exploit their sole understanding of how things work.} } @PhdThesis{Kekkonen:1998:RFA, author = "Synn{\"o}ve Kekkonen", title = "{R\'esistance} aux {Fautes} dans les {Algorithmes} {R\'epartis}: {Auto-Stabilisation} et {Tol\'erance} aux {Fautes}", school = "{Universit\'e} de Paris-Sud, France", year = "1998", OPTcrossref = "", OPTkey = "", OPTaddress = "", OPTmonth = "", OPTtype = "", OPTnote = "", annote = "English title is: ``On Failure Resilience of Distributed Protocols: Self-Stabilization and Fault-Tolerance.'' {Synn\"ove} investigates the (im)possibility of achieving reliability in the presence of systemic and process failures much in the tradition of Anagnostou and Hadzilacos \cite{Anagnostou:1993:TTP}. Failures are modeled as state transitions in the tradition of Arora and Gouda \cite{Arora:1993:CCF} and there are hints to defining a failure model as a program ``augmentation''. The thesis is developed in three stages: first, there is an elaborate chapter on modelling distributed systems as transition systems and defining/proving fault tolerance properties on them. Second, the self-stabilization approach is used to build stabilizing failure detectors and to solve torus orientation in anonymous networks, where the non-terminating nature of the self-stabilization paradigm interfaces well with the impossibility of a terminating solution for the problem. Third, the possibility of simultaneous resilience to process and systemic failures is investigated. Kekkonen proves a main impossibility result: if a problem is $k$-fault-sensitive in an asynchronous $(j,k)$-restrictable network subject to $k>0$ process crashes, then there exists no $k$-fault-tolerant self-stabilizing solution to the problem.
A network is $(j,k)$-restrictable if some subnetwork of $j$ nodes can be replaced by a network of $k$ nodes without changing the ``interface'' structure (e.g., replacing 5 successive nodes in a ring by a single one). A problem is $k$-fault-sensitive for a specific network if there is a $(j,k)$-restriction of the network and the protocol would reach different solutions depending on whether these $j$ processes are alive or $k$ processes have crashed. This is an extension of the result of Anagnostou and Hadzilacos \cite{Anagnostou:1993:TTP} and their notion of failure sensitivity. Examples of $k$-fault-sensitive problems are computing the size of a ring and the $c$-coloring problem on rings. Examples of fault-insensitive problems are unique naming, non-trivial eventual consensus and ring orientation. A heuristic for finding out whether a problem is fault-insensitive or not is to assume that the problem can be solved and then to compare the sets of legitimate states of systems on different restrictions of the original network. If they do not differ, then the problem is fault-insensitive. Overall this is a very thorough and concise thesis, originally written and defended in French." } @InProceedings{Kreitz:1998:PED, author = "Christoph Kreitz and Mark Hayden and Jason Hickey", title = "A proof environment for the development of group communication systems", OPTcrossref = "", OPTkey = "", editor = "H. Kirchner", OPTvolume = "", OPTnumber = "", series = "Lecture Notes in AI", OPTpages = "", booktitle = "15th International Conference on Automated Deduction", year = "1998", OPTorganization = "", publisher = pub-SV, OPTaddress = "", OPTmonth = "", OPTnote = "", OPTannote = "Ensemble is a group communication environment in the tradition of Isis and written in OCaml, a language similar to ML and thus well suited to be manipulated with NuPRL. The authors show how to import Ensemble code into NuPRL, verify certain aspects of a specification and export the code again for execution.
Fault-tolerance is added by using failure detectors and focus is put on safety requirements. Timed I/O automata are used as the basis for formal reasoning about distributed systems. [bibliographic data needs polish!]" } @Unpublished{Kreitz:1998:SWL, author = "Christoph Kreitz", title = "``{\textit{Safety}} ist wichtig, {\textit{liveness}} sieht man.''", note = "Personal communication.", OPTcrossref = "", OPTkey = "", year = "1998", month = mar, annote = "Annotation during a talk on the Ensemble system at TU Darmstadt, March 12th, 1998, concerning a proof of a safety property. Liveness was up to that time of no concern in the project \cite{Kreitz:1998:PED}. See also the paper on Ariane 5 \cite{Dega:1996:RMA}, which supports this claim." } @Article{Kshemkalyani:1998:NSC, author = "Ajay D. Kshemkalyani and Mukesh Singhal", title = "Necessary and Sufficient Conditions on Information for Causal Message Ordering and their Optimal Implementation", journal = j-DC, volume = "11", number = "2", pages = "91--111", year = "1998", annote = "[to read]" } @InProceedings{Lamport:1998:CWM, author = {Leslie Lamport}, title = {Composition: {A} way to make proofs harder}, booktitle = {Compositionality: The Significant Difference (Proceedings of the COMPOS'97 Symposium)}, OPTcrossref = {}, OPTkey = {}, pages = {402--423}, year = {1998}, editor = {Willem-Paul de Roever and Hans Langmaak and Amir Pnueli}, OPTvolume = {}, number = {1536}, series = ser-LNCS, OPTaddress = {}, OPTmonth = {}, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Good title - good paper. Argues that compositionality makes proofs grow fast and that the additional effort is substantial if no automation is used.
Gives an example.} } @TechReport{Liu:1998:SVF, author = {Zhiming Liu and Mathai Joseph}, title = {Specification and verification of fault-tolerance, timing and scheduling}, institution = {Department of Mathematics and Computer Science, University of Leicester, U.K.}, year = {1998}, OPTkey = {}, OPTtype = {}, number = {1998/5}, OPTaddress = {}, OPTmonth = {}, OPTnote = {}, annote = {Accepted at ACM TOPLAS. Extends earlier work of Liu and Joseph \cite{Liu:1992:TPF,Liu:1993:SVR,Liu:1994:SDF,Liu:1996:VFR} to transformational reasoning about fault-tolerant real-time systems. The formalism used is TLA and only safety properties of programs are considered. The computational model and the way to reason about fault tolerant systems using refinement is essentially the same as in \cite{Liu:1996:VFR,Liu:1992:TPF} but is nicely summarized and brought into the TLA framework. The main body of the paper deals with timing and scheduling. These issues are introduced into the formalism by introducing lower and upper time bounds to actions and adding a real time clock (similar to \cite{Abadi:1994:OFR}). The global fault assumption must be extended to specify minimum time lengths in which faults are not repeated and by assuming that the scheduler is not subject to faults. Much interest is laid on timing feasibility, meaning whether there exists a scheduler to schedule a program correctly. It is shown how to reason compositionally about programs combined with schedulers, thus abstracting away from any specific implementation or policy. This is exemplified by taking a fixed priority scheduling scheme from the literature and using it to show feasibility. Discussion of related work mostly covers scheduling work, while initial historical remarks also deal with formal methods in fault tolerance.
A very good paper; can be seen as the quintessence of Liu and Joseph's work over the last decade.} } @Article{Marcopulos:1998:FBC, author = "Ted Marcopulos", title = "Faster, better, cheaper space exploration", OPTcrossref = "", OPTkey = "", journal = "IEEE Spectrum", year = "1998", volume = "35", number = "8", pages = "68--74", month = aug, OPTnote = "", annote = "The author surveys NASA's recent attempts to apply commercial management and development schemes to their current space exploration programs. It turns out that there is a drive towards eliminating redundancy in large parts of the system because components are already reliable enough for unmanned space flight and redundancy is costly both in weight, dollars and software/hardware complexity. This is a good reference together with \cite{Dega:1996:RMA}." } @Article{Marcus:1998:WTD, author = {Stephen J. Marcus}, title = {What to do about bolts from the blue}, journal = {IEEE Spectrum}, year = 1998, OPTkey = {}, volume = 35, number = 12, pages = "34--41", month = dec, OPTnote = {}, annote = {Fascinating report on the danger of the earth being hit by an asteroid and the issues involved. A large scale example of being able to tolerate severe faults by detection and correction.} } @InProceedings{Merritt:1998:FSO, author = "Michael Merritt and Gadi Taubenfeld", title = "Fairness of Shared Objects", booktitle = {Proceedings of the 12th International Symposium on DIStributed Computing (DISC'98)}, pages = "303--316", year = "1998", series = ser-LNCS, number = "1499", month = sep, address = "Andros, Greece", annote = "Here, fairness is not defined with respect to processes or schedulers, but with respect to accesses to distinct shared objects. This is a way of encapsulating fairness assumptions (and thus timing assumptions) into modules quite nicely.
Four types of fair objects are considered: deadlock-free (if some process tries to access some object, eventually some process will succeed in accessing that object), starvation-free (if a process tries to access an object, then it will eventually succeed), bounded-waiting (deadlock-free and there is an (unknown?) upper bound $r$ on the number of times that some other process can access an object before another process wanting to access the object succeeds), $r$-bounded-waiting (deadlock-free and there is a fixed upper bound $r$ on the number of times other processes can succeed before myself). It turns out that deadlock-free objects are weaker than starvation-free objects (using starvation-free objects makes some problems solvable), starvation-free and bounded-waiting objects are ``similar'' and $r$-bounded-waiting objects are much stronger than bounded-waiting objects. A nice result shows that safety properties are immune to fairness assumptions (a similar result is attributed to \cite{Alur:1997:TAA}). There's a good related work section discussing the relationship between time/fairness and system models." } @InProceedings{Riordan:1998:CEP, author = {J. Riordan and B. Schneier}, title = {A Certified E-Mail Protocol with No Trusted Third Party}, booktitle = {Proceedings of the 13th Annual Computer Security Applications Conference}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {1998}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, month = dec, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, annote = {Presents the same protocol as \cite{Pagnia:1997:TMP} and relates it to the current Internet infrastructure. Get it at: http://www.counterpane.com/certified-email.html} } @InProceedings{Rothermel:1998:FPP, author = "Kurt Rothermel and Markus Stra{\ss}er", year = "1998", title = "{A Fault-Tolerant Protocol for Providing the Exactly-Once Property of Mobile Agents}", booktitle = "Proc.
17th IEEE Symposium on Reliable Distributed Systems 1998 (SRDS'98)", publisher = "IEEE Computer Society Press", address = "Los Alamitos, California", pages = "100--108", annote = "[to read]" } @InCollection{Sander:1998:PMA, author = {T. Sander and C. F. Tschudin}, title = {Protecting Mobile Agents Against Malicious Hosts}, booktitle = {Mobile Agents and Security}, crossref = {Vigna:1998:MAS}, annote ="Angabe von Uwe Wilhelm" } @InProceedings{Sander:1998:TMC, author = "T. Sander and C. Tschudin", title = "Towards Mobile Cryptography", added-at = "Wed Apr 8 11:17:26 1998", abstract = "Mobile code technology has become a driving force for recent advances in distributed systems. The concept of mobility of executable code raises major security problems. In this paper we deal with the protection of mobile code from possibly malicious hosts. We conceptualize on the specific cryptographic problems posed by mobile code. We are able to provide a solution for some of these problems: We present techniques how to achieve ``non--interactive computing with encrypted programs'' in certain cases and give a complete solution for this problem in important instances. We further present a way how an agent might securely perform a cryptographic primitive, digital signing, in an untrusted execution environment. Our results are based on the use of homomorphic encryption schemes and function composition techniques.", online = "http://www.icsi.berkeley.edu/~tschudin/ps/ieee-sp98.ps.gz", booktitle = "Proceedings of the {IEEE} Symposium on Research in Security and Privacy", address = "Oakland, CA", year = "1998", publisher = pub-IEEE, month = may, OPTorganization = "{IEEE} Computer Society, Technical Committee on Security and Privacy", annote = "interesting paper doing a significant step towards protecting mobile code from its host without requiring trusted hardware.
currently only solutions for rationals/polynomial functions are outlined (but not yet for Boolean circuits (equivalent to Turing machines!)) and there is also still a need for secure birational functions to make the ideas work.", } @InProceedings{Schneider:1998:FAN, author = "Steve Schneider", title = "Formal Analysis of a Non-Repudiation Protocol", booktitle = "PCSFW: Proceedings of The 11th Computer Security Foundations Workshop", publisher = "IEEE Computer Society Press", year = "1998", pages = "54--65", annote = "The author presents a formal analysis of the Zhou/Gollmann fair non-repudiation protocol \cite{Zhou:1996:FNP} (which is in fact similar to the protocol of \cite{Pagnia:1999:EGP}). The formalism used is CSP \cite{Hoare:1984:CSP}. Apart from the rigor with which the protocol is modeled and proved, an interesting fact here is that the author also stumbles over the necessity of liveness in the specification (an aspect discussed in \cite{Pagnia:1999:IFE}): state can be ``imposed'' on a process by assuring that it is able to make a state change if the process wants to. This is formalized as the following liveness property: if process A wants to make a state change depending on the receipt of message m from the trusted third party, then A will eventually receive m. This implies that the trusted third party is continuously available and has m ready and waiting for delivery to A. In this paper, A queries the trusted third party. Consequently, reliable communication to the trusted authority must be assumed. Another interesting point is a `generates' relation between messages which is used in the proof. This is reminiscent of the formalization of non-cooperative Byzantine faults \cite{Echtle:1999:UCB}."
} @Article{Schneier:1998:CDV, author = {Bruce Schneier}, title = {Cryptographic design vulnerabilities}, journal = j-COMPUTER, year = {1998}, OPTkey = {}, volume = {31}, number = {9}, pages = {29--33}, month = sep, OPTnote = {}, annote = {Briefly discusses the notions of detection and correction in the context of cryptography and security.} } @InProceedings{Siegel:1998:FVS, author = {Michael Siegel}, title = {Formal verification of stabilizing systems}, booktitle = {Proceedings of the 5th International Symposium on Formal Techniques in Real Time and Fault Tolerant Systems (FTRTFT'98)}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {1998}, editor = {Anders P. Ravn and Hans Rischel}, OPTvolume = {}, number = {1486}, series = ser-LNCS, address = {Lyngby, Denmark}, month = sep, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Describes a calculus to perform formal proofs of stabilizing algorithms. The setting is fair transition systems and temporal logic. Gives proof rules for composing and refining stabilizing systems.} } @InProceedings{Singhai:1998:SFI, author = "Ashish Singhai and Swee-Boon Lim and Sanjay R. Radia", title = "The {SunSCALR} framework for Internet Servers", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "108--117", booktitle = pro-ftcs98, year = "1998", OPTorganization = "", OPTpublisher = "", OPTaddress = "", month = jun, OPTnote = "", annote = "First available implementation of a self-stabilizing algorithm in an industrial product. Also an example for the applications of non-masking fault tolerance." } @inproceedings{Stoller:1998:ASB, author = "Scott D. Stoller and Fred B.
Schneider", title = "Automated Stream-Based Analysis of Fault-Tolerance", booktitle = "Formal Techniques in Real-Time and Fault-Tolerant Systems", publisher = pub-SV, series = ser-LNCS, volume=1486, pages="113--122", year=1998, month=sep, address="Lyngby, Denmark", url = "\url{http://ftp.cs.indiana.edu/pub/stoller/FTRTFT98-extended.ps.gz}", annote = "[to read]" } @InProceedings{Tarafdar:1998:AFC, author = {Ashis Tarafdar and Vijay K. Garg}, title = {Addressing false causality while detecting predicates in distributed programs}, booktitle = pro-icdcs98, OPTcrossref = {}, OPTkey = {}, pages = {94--101}, year = {1998}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Amsterdam, The Netherlands}, OPTmonth = {}, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, url = "http://www.ece.utexas.edu/~garg/dist/dcs98-ashis.ps.Z", annote = {The general causality relation based on the happened before relation imposes a causal order on events which may be causally unrelated. For example, two successive events on one process are causally ordered by default, but this order may simply have been imposed by a random scheduler and the events are in fact events on two independent threads of the process. This is called `false causality' and has been a critique of the happened-before model of distributed computations. The authors present a way of extending the partial order model to ``split up'' the execution order of a process into multiple threads and actually treating independent events as such in the causality relation. This adds complexity to the problem and they show that it becomes NP complete (the original problem is NP complete already \cite{Chase:1998:DGP}). However, for a restricted class of predicates (weak conjunctive ones) they give an efficient algorithm to detect them.
States that approaches to predicate detection fall into three classes: (1) snapshot based ones \cite{Chandy:1985:DSD} only suitable for stable predicates, (2) lattice construction based ones \cite{Cooper:1991:CDG} and (3) restriction based approaches like those of Garg.} } @InProceedings{Theel:1998:OPS, author = "Oliver Theel and Felix C. {G\"artner}", title = "On proving the stability of distributed algorithms: self-stabilization vs. control theory", OPTcrossref = "", OPTkey = "", editor = "Vladimir B. Bajic", volume = "III", OPTnumber = "", OPTseries = "", pages = "58--66", booktitle = "Proceedings of the International Systems, Signals, Control, Computers Conference (SSCC'98), Durban, South Africa", year = "1998", OPTorganization = "", OPTpublisher = "", OPTaddress = "", month = sep, note = "", annote = "[to write ;-)]" } @Article{Thurner:1998:VKS, author = {Erwin Thurner and Mario Dal Cin and Winfried {Schneewei\ss{}}}, title = {{Verl\"a\ss{}lichkeitsbewertung komplexer Systeme}}, journal = j-IS, year = {1998}, OPTkey = {}, volume = {21}, number = {6}, pages = {318--327}, month = dec, OPTnote = {}, annote = {Deutsche Einf\"uhrung in Begriffe wie Zuverl\"assigkeit, mittlere Lebensdauer (MTTF), Ausfallrate, Sicherheit, MTBF, Verf\"ugbarkeit, sowie die Methoden Fehlerb\"aume, Markovketten und hybride Ans\"atze. Konzentration auf analytische Bewertungen, nicht auf experimentelle (f\"ur experimentelle siehe \cite{Echtle:1998:FMB}).} } @Book{Vigna:1998:MAS, editor = {G. 
Vigna}, title = {Mobile Agents and Security}, publisher = pub-SV, year = 1998, volume = 1419, series = ser-LNCS, address = {Berlin}, annote = "Angabe von Uwe Wilhelm" } @InProceedings{Voelzer:1998:VFT, author = "Hagen {V\"olzer}", title = "Verifying fault tolerance of distributed algorithms formally: {An} example", OPTcrossref = "", OPTkey = "", OPTeditor = "", OPTvolume = "", OPTnumber = "", OPTseries = "", pages = "187--197", booktitle = "Proceedings of the International Conference on Application of Concurrency to System Design (CSD98)", year = "1998", OPTorganization = "", publisher = pub-IEEE, address = "Fukushima, Japan", month = mar, OPTnote = "", annote = "This paper investigates the fully mechanical verification of fault tolerant algorithms using the DAWN approach \cite{Weber:1997:DAW} which is based on Petri nets. The main point in doing so is to formally handle faults and fault models. This is done by distinguishing an (informal) fault model from a formal fault impact model specified by a Petri net. In this example, crash and omission faults are formalized by additional state transitions which are superimposed onto an algorithm for fault free executions. In addition to a fault impact model, a ``rely'' property belongs to the fault model. Such a property formalizes ``assumptions about the environment'' like the maximum number of faults that may occur, and it makes these assumptions exploitable by a proof. The example algorithm used is the SELF-2 fault diagnosis algorithm by Kuhl and Reddy. The paper shows the advantages of Petri nets in formulating and reasoning about distributed algorithms. The superimposition property of such nets makes the approach extremely useful for fault tolerant algorithms." } @inproceedings{Wilhelm:1998:PTM, year = {1998}, title = {On the Problem of Trust in Mobile Agent Systems}, author = {U. G. Wilhelm and L. Butty{\'a}n and S.
Staamann}, booktitle = {Symposium on Network and Distributed System Security}, publisher = {Internet Society}, keywords = {IMPORTANT; Security}, month = mar, pages = "114--124", annote = "[to read]" } @Article{Aguilera:1999:UHF, author = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg", title = "Using the heartbeat failure detector for quiescent reliable communication and consensus in partitionable networks", journal = "Theoretical Computer Science", volume = "220", number = "1", pages = "3--30", day = "06", month = jun, year = "1999", coden = "TCSCDI", ISSN = "0304-3975", bibdate = "Mon Jul 19 22:22:41 MDT 1999", url = "http://www.elsevier.com/cas/tree/store/tcs/sub/1999/220/1/3045.pdf", acknowledgement = ack-nhfb, annote = "[to read]" } @TechReport{Aguilera:1999:WFD, year = "1999", number = "TR99-1741", institution = "Cornell University, Computer Science", title = "On the Weakest Failure Detector for Uniform Reliable Broadcast", author = "Marcos Kawazoe Aguilera and Sam Toueg and Borislav Deianov", abstract = "Uniform Reliable Broadcast (URB) is a communication primitive that requires that if a process delivers a message, then all correct processes also deliver this message. A recent PODC paper \cite{Halpern:1999:KAU} uses Knowledge Theory to determine what failure detectors are necessary to implement this primitive in asynchronous systems with process crashes and lossy links that are fair. 
In this paper, we revisit this problem using a different approach, and provide a result that is simpler, more intuitive, and, in a precise sense, more general.", month = apr # " 30,", annote = "" } @Article{Benassi:1999:T, author = "Paola Benassi", title = "{TRUSTe}: An online privacy seal program", journal = "Communications of the ACM", volume = "42", number = "2", pages = "56--59", month = feb, year = "1999", coden = "CACMA2", ISSN = "0001-0782", bibdate = "Fri Feb 5 07:01:55 MST 1999", url = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p56-benassi/", acknowledgement = ack-nhfb, annote = "TRUSTe is a trustmark or a seal which providers can put onto their web pages in order to indicate sound privacy practices. The trustmark is issued by an organization which checks the pages (\url{www.truste.org}). See also \cite{Reagle:1999:PPP}." } @Article{Billinghurst:1999:WDN, author = "Mark Billinghurst and Thad Starner", title = "Wearable Devices: New Ways to Manage Information", journal = "Computer", volume = "32", number = "1", pages = "57--64", month = jan, year = "1999", coden = "CPTRB4", ISSN = "0018-9162", bibdate = "Fri Jan 15 16:17:58 MST 1999", url = "http://www.computer.org/computer/co1999/r1057abs.htm; http://dlib.computer.org/co/books/co1999/pdf/r1057.pdf", acknowledgement = ack-nhfb, annote = "A thrilling and fascinating article on a somewhat underestimated branch of computer science. Computers can be incorporated into clothing, eyeglasses, can be worn around the neck, in a wristwatch, etc. Applications of wearable computers (also non-military) are given: navigation using augmented reality, wearable bar code scanners at UPS. The article also takes a shot at predicting what comes next: for example using augmented reality to do conferencing. Pointers to conferences, companies and research projects concerning wearables round off the article. For a market survey as of 2000 see \cite{Ditlea:2000:PCG}." } @Article{Boyle:1999:DYT, author = "James M.
Boyle and R. Daniel Resler and Victor L. Winter", title = "Do You Trust Your Compiler?", journal = "Computer", volume = "32", number = "5", pages = "65--73", month = may, year = "1999", url = "http://www.computer.org/computer/co1999/r5065abs.htm; http://dlib.computer.org/co/books/co1999/pdf/r5065.pdf", annote = "There are two problems involved when using formal methods to produce correct software: (1) coming up with an accurate formal specification of the problem, and (2) producing a correct implementation of the specification or verifying that a given implementation is correct regarding the specification. This paper addresses the second problem and uses buggy compilers to motivate it. Bugs in compilers are well-documented (see news:gnu.gcc.bug for example). The idea is to start with a high level code and apply correctness preserving transformations to it until a lower level code is reached. Denotational semantics is used to define `correctness preserving'. As an open research problem it is noted that producing code from safety and liveness specifications would be good." } @InProceedings{Cardellini:1999:RAL, author = {Valeria Cardellini and Michele Colajanni and Philip S. Yu}, title = {Redirection Algorithms for Load Sharing in Distributed Web-server Systems}, booktitle = pro-icdcs99, OPTcrossref = {}, OPTkey = {}, pages = {528--535}, year = {1999}, editor = {Mohamed G.
Gouda}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, month = may # "/" # jun, OPTorganization = {}, publisher = pub-IEEE, OPTnote = {}, annote = {[to read]} } @Article{Carreira:1999:FIS, author = {Jo\~ao Viegas Carreira and Diamantino Costa and Jo\~ao Gabriel Silva}, title = {Fault injection spot-checks computer system dependability}, journal = {IEEE Spectrum}, year = {1999}, OPTkey = {}, volume = {36}, number = {8}, pages = {50--55}, month = aug, OPTnote = {}, annote = {A good motivation and introduction to fault injection from a more hardware point of view than \cite{Hsueh:1997:FIT,Echtle:1998:FMB}. Contains terms Heisenbugs (a failure that is not reconstructable), and Bohrbugs (the opposite). Like \cite{Rushby:1994:CSP}, it states that attaching reliability figures to a system is problematic, even if the failure model is precisely fixed. States that there is research in Sematech, HP, Compaq and Stanford to collect real fault data and thus enable more realistic failure models.} } @Article{Cristian:1999:TAD, author = "Flaviu Cristian and Christof Fetzer", title = "The Timed Asynchronous Distributed System Model", journal = "{IEEE} Transactions on Parallel and Distributed Systems", year = "1999", volume = "10", number = "6", month = jun, url = "http://www-cse.ucsd.edu/users/cfetzer/MODEL/", abstract = "We propose a formal definition for the timed asynchronous distributed system model. We present extensive measurements of actual message and process scheduling delays and hardware clock drifts. These measurements confirm that this model adequately describes current distributed systems such as a network of workstations. We also give an explanation of why practically needed services, such as consensus or leader election, which are not implementable in the time-free model, are implementable in the timed asynchronous system model.", language = "English", annote = "A revised version of \cite{Cristian:1998:TAS}."
}

@InProceedings{Echtle:1999:UCB,
  author    = {Klaus Echtle and Asif Masum},
  title     = {Understanding Cooperative {Byzantine} Failures: {A} Novel
               Failure Classification to Enable Efficient Fault-Tolerant
               Protocols},
  booktitle = {Proceedings of the Annual IEEE Workshop on Fault-Tolerant
               Parallel and Distributed Systems (FTPDS'99)},
  year      = {1999},
  address   = {San Juan, Puerto Rico, USA},
  month     = apr,
  publisher = {Kluwer},
  annote    = {The authors present a unifying approach to modeling the
               common fault classes formally. This is the most general
               approach that I know of and in contrast to
               \cite{Gaertner:1998:SFT} seems to be easier to adapt to the
               common fault classes (and is able to derive new ones). The
               work contains a near-to-complete list of references to
               fault classification work and puts the terms fault, error
               and failure in a nice layered context (p. 2). The model
               consists of a set of $n$ components that can be separated
               into fault free and faulty ones. Components communicate by
               sending messages from some fixed message set. Sending and
               receipt of a message trigger events. An event is a tuple
               consisting of the message, the event type (send/receive)
               and a time tag, which specifies the global point in
               continuous real time in which the event occurs. Component
               behaviors can now be described as event sets, which through
               the time tag implicitly define a single (?!) sequence of
               events (not a set of sequences?). A specification $S_i$ for
               component $i$ is a set of correct input/output tuples, i.e.
               a relation over input sequences and output sequences.
               Failure modes are defined in a functional way: a failure
               mode identifies sets of behaviors which a component may
               exhibit following the occurrence of a set of receive
               events.
Now it is possible to define the different ``failure mode functions'' for correct behavior, fail-silent, fail-omission, message loss, message duplication etc. by changing tags in message sets or message sets themselves. To define failures affecting code integrity (e.g. altered messages) the authors define the concept of a failure capability $C_i$ for component $i$. This can be seen as a degraded component specification, i.e. is the set of behaviors allowed by $i$ if it is faulty. Using this construct it is possible to derive a rich set of distinctive failure modes visualized in Fig. 10. As a further novelty, the authors introduce a new failure mode, that of non-cooperative Byzantine. This is where no malicious cooperation takes place between faulty nodes. This is formalized along the idea that such behavior must be based on either (1) malicious treason (e.g., revealing a secret key) or (2) malicious delegation (e.g. some node asks another node to sign a message). Malicious cooperation is then defined (on p. 19) as ``increasing the failure capability by the receipt of a message'' (see also the `generates' relation of \cite{Schneider:1998:FAN}). Non-cooperative behavior is defined as the complement of malicious cooperation. It is nice to have different types of Byzantine behaviors because this can result in protocols that are more efficient. This is shown by example. Overall a formal, but very rewarding paper which can also be used as an overview over the state of the art in failure classification.
See also \cite{Echtle:2000:FFM} and Asif's thesis.}
}

@InProceedings{Essame:1999:PPA,
  author    = {Didier Essame and Jean Arlat and David Powell},
  title     = {Padre: {A} protocol for asymmetric duplex redundancy},
  booktitle = {Proceedings of the Seventh IFIP International Working
               Conference on Dependable Computing for Critical
               Applications},
  year      = {1999},
  address   = {San Jose, USA},
  month     = jan,
  annote    = {[to get] uses the timed asynchronous model to build a fully
               automated train control system, cited in
               \cite{Cristian:1999:TAD}.}
}

@InProceedings{Felber:1999:FDF,
  author    = "Pascal Felber and Xavier D{\'e}fago and Rachid Guerraoui
               and P. Oser",
  title     = "Failure Detectors as First Class Objects",
  booktitle = "Proceedings of the International Symposium on Distributed
               Objects and Applications (DOA'99)",
  pages     = "132--141",
  year      = "1999",
  address   = "Edinburgh, Scotland",
  month     = sep,
  annote    = "[to get]"
}

@Article{Felber:1999:POD,
  author  = "Pascal Felber and Rachid Guerraoui and Mohamed E. Fayad",
  title   = "Putting {OO} distributed programming to work",
  journal = j-CACM,
  volume  = "42",
  number  = "11",
  pages   = "97--101",
  month   = nov,
  year    = "1999",
  url     = "http://www.acm.org/pubs/articles/journals/cacm/1999-42-11/p97-felber/p97-felber.pdf;
             http://www.acm.org/pubs/citations/journals/cacm/1999-42-11/p97-felber/",
  annote  = "Discusses different approaches to specify, model and
             implement failure detectors. Distinguishes the push model,
             pull model and the dual model (combination of push and pull).
             Similar title is \cite{Felber:1999:FDF}. Failure detector
             implementations also discussed in \cite{Sergent:1999:FDI}."
}

@InProceedings{Fetzer:1999:CTA,
  author    = {Christof Fetzer},
  title     = {A comparison of timed asynchronous systems and asynchronous
               systems with failure detectors},
  booktitle = {Proceedings of the Third European Research Seminar on
               Advances in Distributed Systems (ERSADS'99)},
  pages     = {109--118},
  year      = {1999},
  address   = {Madeira Island, Portugal},
  month     = apr,
  annote    = {[to write]}
}

@Article{Gabber:1999:CYA,
  author          = "Eran Gabber and Phillip B. Gibbons and David M. Kristol
                     and Yossi Matias and Alain Mayer",
  title           = "Consistent, yet anonymous, {Web} access with {LPWA}",
  journal         = j-CACM,
  volume          = "42",
  number          = "2",
  pages           = "42--47",
  month           = feb,
  year            = "1999",
  coden           = "CACMA2",
  ISSN            = "0001-0782",
  bibdate         = "Fri Feb 5 07:01:55 MST 1999",
  url             = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p42-gabber/",
  acknowledgement = ack-nhfb,
  annote          = "The LPWA is the Lucent Personalized Web Assistant, a
                     tool which helps you manage different pseudonyms and
                     thus manage anonymity on the web. Related articles are
                     about Crowds \cite{Reiter:1999:AWT}, onion routing
                     \cite{Goldschlag:1999:OR}, and
                     \cite{Reagle:1999:PPP,Benassi:1999:T}."
}

@InProceedings{Gaertner:1999:AFD,
  author    = {Felix C. G{\"a}rtner and Henning Pagnia and Holger Vogt},
  title     = {Approaching a formal definition of fairness in electronic
               commerce},
  booktitle = {Proceedings of the International Workshop on Electronic
               Commerce (WELCOM'99)},
  pages     = {354--359},
  year      = {1999},
  address   = {Lausanne, Switzerland},
  month     = oct,
  publisher = pub-IEEE,
  annote    = {[to write]}
}

@Unpublished{Gaertner:1999:DR,
  author = {Felix C.
G{\"a}rtner and Hagen V{\"o}lzer},
  title  = {Defining Redundancy in Fault-Tolerant Computing},
  note   = {unpublished manuscript},
  year   = {1999}
}

@InProceedings{Gaertner:1999:ESD,
  author    = {Felix C. G{\"a}rtner},
  title     = {An exercise in systematically deriving fault-tolerance
               specifications},
  booktitle = {Proceedings of the Third European Research Seminar on
               Advances in Distributed Systems (ERSADS'99)},
  year      = {1999},
  address   = {Madeira Island, Portugal},
  month     = apr,
  annote    = {Shorter Version of \cite{Gaertner:1999:ESDFS}.}
}

@TechReport{Gaertner:1999:ESDFS,
  author      = {Felix C. G{\"a}rtner},
  title       = {An exercise in systematically deriving fault-tolerance
                 specifications},
  institution = {Department of Computer Science, Darmstadt University of
                 Technology},
  year        = {1999},
  number      = {TUD-BS-1999-01},
  address     = {Darmstadt, Germany},
  month       = mar,
  OPTnote     = {Available at http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-01.ps.gz}
}

@Article{Gaertner:1999:FFT,
  author  = {Felix C. G{\"a}rtner},
  title   = {Fundamentals of fault-tolerant distributed computing in
             asynchronous environments},
  journal = j-ACM-COMP-SURVEYS,
  year    = {1999},
  volume  = {31},
  number  = {1},
  pages   = {1--26},
  month   = mar,
  annote  = {updated version of \cite{Gaertner:1998:FFT}.}
}

@TechReport{Gaertner:1999:FUF,
  author = {Felix C.
G{\"a}rtner and Armin Wolfram},
  title       = {{Fehlererkennung und Fehlerdiagnose f\"ur
                 verl\"a\ss{}liche Systeme -- Automatisierungstechnik
                 vs.~verteilte Systeme}},
  institution = {Department of Computer Science, Darmstadt University of
                 Technology},
  year        = {1999},
  number      = {TUD-BS-1999-03},
  address     = {Darmstadt, Germany},
  month       = jul
}

@InProceedings{Gaertner:1999:SLD,
  author    = {Felix C. G{\"a}rtner and Henning Pagnia},
  title     = {Self-stabilizing Load Distribution for Replicated Servers on
               a Per-Access Basis},
  booktitle = pro-wss99,
  pages     = {102--109},
  year      = {1999},
  editor    = {Anish Arora},
  address   = {Austin, TX, USA},
  month     = jun,
  publisher = pub-IEEE,
  annote    = {A self-stabilizing extension to existing load balancing
               schemes (such as
               \cite{Arora:1997:OCC,Arora:1995:ECC,Gronning:1990:SDD}) to
               allow fine grained load distribution based on redirection.
               Pointers to commercial realizations appear in
               \cite{Cardellini:1999:RAL}.}
}

@TechReport{Gaertner:1999:STA,
  author      = {Felix C. G{\"a}rtner},
  title       = {A survey of transformational approaches to the
                 specification and verification of fault-tolerant systems},
  institution = {Department of Computer Science, Darmstadt University of
                 Technology},
  year        = {1999},
  number      = {TUD-BS-1999-04},
  address     = {Darmstadt, Germany},
  month       = apr,
  note        = {To appear in \textit{Journal of Universal Computer
                 Science} (J.UCS), special issue on ``Dependability
                 Evaluation and Assessment'' (November, 1999).},
  annote      = {Journal version \cite{Gaertner:1999:TAS}.}
}

@Article{Gaertner:1999:TAS,
  author = {Felix C.
G{\"a}rtner},
  title   = {Transformational Approaches to the Specification and
             Verification of Fault-Tolerant Systems: {Formal} Background
             and Classification},
  journal = {Journal of Universal Computer Science (J.UCS)},
  year    = {1999},
  volume  = {5},
  number  = {10},
  pages   = {668--692},
  month   = oct,
  note    = {Special Issue on Dependability Evaluation and Assessment},
  annote  = {Prior technical report \cite{Gaertner:1999:STA}.}
}

@Article{Glass:1999:RST,
  author          = "Robert L. Glass",
  title           = "The realities of software technology payoffs",
  journal         = j-CACM,
  volume          = "42",
  number          = "2",
  pages           = "74--79",
  month           = feb,
  year            = "1999",
  coden           = "CACMA2",
  ISSN            = "0001-0782",
  bibdate         = "Fri Feb 5 07:01:55 MST 1999",
  url             = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p74-glass/",
  acknowledgement = ack-nhfb,
  annote          = "Glass studies which new software engineering practices
                     have turned out to pay off in the long run. These
                     technologies are: structured techniques, fourth
                     generation languages, CASE, formal methods, cleanroom
                     methodology, process models, object-orientation.
                     Especially interesting to me is the discussion of
                     formal methods. Glass says that it has been little
                     used because it still is largely underdefined and
                     underevaluated. Only one study has brought forward
                     hard numbers \cite{Ralston:1991:FMH}."
}

@Article{Goldschlag:1999:OR,
  author          = "David Goldschlag and Michael Reed and Paul Syverson",
  title           = "Onion routing",
  journal         = j-CACM,
  volume          = "42",
  number          = "2",
  pages           = "39--41",
  month           = feb,
  year            = "1999",
  coden           = "CACMA2",
  ISSN            = "0001-0782",
  bibdate         = "Fri Feb 5 07:01:55 MST 1999",
  url             = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p39-goldschlag/",
  acknowledgement = ack-nhfb,
  annote          = "See the \url{www.onion-router.net}.
Other methods to achieve privacy on the net are discussed in other
                     articles from this CACM issue
                     \cite{Reiter:1999:AWT,Gabber:1999:CYA,Reagle:1999:PPP}
                     and \cite{Benassi:1999:T}."
}

@Article{Grimley:1999:PIA,
  author  = {Michael J. Grimley and Brian D. Monroe},
  title   = {Protecting the integrity of agents: {An} exploration into
             letting agents loose in an unpredictable world},
  journal = {Crossroads: The {ACM} Student Magazine},
  year    = {1999},
  volume  = {5},
  number  = {4},
  pages   = {10--17},
  annote  = {A good and brief survey introduction into the issues of
             security of agents (both protecting agents from their
             execution environments and vice versa), with lots of good
             references. A good starting point.}
}

@InProceedings{Halpern:1999:KAU,
  author    = {Joseph Y. Halpern and Aleta Ricciardi},
  title     = {A knowledge-theoretic analysis of uniform distributed
               coordination and failure detectors},
  booktitle = pro-podc99,
  pages     = {73--82},
  year      = {1999},
  annote    = {A discussion appears in \cite{Aguilera:1999:WFD}.}
}

@Article{Hennessy:1999:FSR,
  author  = "John Hennessy",
  title   = "The Future of Systems Research",
  journal = j-COMPUTER,
  volume  = "32",
  number  = "8",
  pages   = "27--33",
  month   = aug,
  year    = "1999",
  url     = "http://www.computer.org/computer/co1999/r8027abs.htm;
             http://dlib.computer.org/co/books/co1999/pdf/r8027.pdf",
  annote  = "A speculation on what will be and what should be the subject
             of research and development in systems in the next years.
             Interesting is that the author explicitly mentions
             availability as a key issue and fault-tolerance as a key
             mechanism.
However, fault tolerance research must focus more on gradual and dynamic
             mechanisms, not directly hiding fault evidence but helping
             maintain availability, for example like in the RAID approach
             \cite{Patterson:1988:CRA}. A good reference for the importance
             of fault tolerance research."
}

@Article{Hoffman:1999:PCL,
  author  = {Forrest Hoffman and William Hargrove},
  title   = {Parallel computing with {Linux}},
  journal = {Crossroads: The {ACM} Student Magazine},
  year    = {1999},
  volume  = {6},
  number  = {1},
  pages   = {23--27},
  annote  = {Gives a practical guide to installing a beowulf parallel
             computing system at your home. Gives a lot of online
             references to more information and is a good starting point
             for beowulf projects.}
}

@Article{Hurfin:1999:SFA,
  author   = "Michel Hurfin and Michel Raynal",
  title    = "A Simple and Fast Asynchronous Consensus Protocol Based on a
              Weak Failure Detector",
  journal  = j-DC,
  volume   = "12",
  number   = "4",
  pages    = "209--223",
  year     = "1999",
  abstract = "The Consensus problem is a fundamental paradigm for
              fault-tolerant asynchronous systems. It abstracts a family of
              problems known as Agreement (or Coordination) problems. Any
              solution to consensus can serve as a basic building block for
              solving such problems (e.g., atomic commitment or atomic
              broadcast). Solving consensus in an asynchronous system is
              not a trivial task: it has been proven (1985) by Fischer,
              Lynch and Paterson that there is no deterministic solution in
              asynchronous systems which are subject to even a single crash
              failure. To circumvent this impossibility result, Chandra and
              Toueg have introduced the concept of unreliable failure
              detectors (1991), and have studied how these failure
              detectors can be used to solve consensus in asynchronous
              systems with crash failures. This paper presents a new
              consensus protocol that uses a failure detector of the class
              $\Diamond{\cal S}$.
Like previous protocols, it is based on the rotating coordinator paradigm
              and proceeds in asynchronous rounds. Simplicity and
              efficiency are the main characteristics of this protocol.
              From a performance point of view, the protocol is
              particularly efficient when, whether failures occur or not,
              the underlying failure detector makes no mistake (a common
              case in practice). From a design point of view, the protocol
              is based on the combination of three simple mechanisms: a
              voting mechanism, a small finite state automaton which
              manages the behavior of each process, and the possibility for
              a process to change its mind during a round.",
  annote   = "Must be noted as one of the standard consensus protocols
              among \cite{Chandra:1996:UFD} and
              \cite{Schiper:1997:ECA,Schiper:1997:EEC}."
}

@Article{Jajodia:1999:SIW,
  author  = "Sushil Jajodia and Paul Ammann and Catherine D. McCollum",
  title   = "Surviving Information Warfare Attacks",
  journal = j-COMPUTER,
  volume  = "32",
  number  = "4",
  pages   = "57--63",
  month   = apr,
  year    = "1999",
  coden   = "CPTRB4",
  ISSN    = "0018-9162",
  bibdate = "Thu Apr 1 07:09:15 MST 1999",
  url     = "http://www.computer.org/computer/co1999/r4057abs.htm;
             http://dlib.computer.org/co/books/co1999/pdf/r4057.pdf",
  annote  = "Describes the dangers which information systems are subject
             to and the traditional methods of preventing them (fault
             tolerance, database system management mechanisms). A realistic
             alternative to these two approaches is described that is a
             mixture of both, attacks and countermeasures are briefly
             described. While the exact mechanisms remain rather
             superficial, this paper is another example for the fact that
             security can also be seen as a fault tolerance problem
             (\cite{Arora:1998:DMF} is cited directly) with all the
             implications. See also \cite{Schneier:1998:CDV}."
}

@InProceedings{Jochim:1999:AGD,
  author    = {Markus Jochim},
  title     = {Automatic Generation of Diversified Program Variants
               Optimized to Detect Hardware Faults},
  booktitle = {Tenth European Workshop on Dependable Computing (EWDC-10)},
  pages     = {169--174},
  year      = {1999},
  address   = {Vienna, Austria},
  url       = "http://www.cs.uni-essen.de/Fachgebiete/Depend/Papers/Joch99/",
  annote    = {Presents ideas on how to automatically introduce code
               diversity into machine programs so that two distinct but
               semantically equivalent processes can run in parallel
               (virtual duplex system) and detect hardware errors with high
               probability. Discusses practical considerations in the
               design of code mutation rules like independence of
               addressing mode, overflow, short code production etc.}
}

% Same proceedings as Cardellini:1999:RAL (same editor, publisher and
% year), so the shared booktitle macro is used here as well.
@InProceedings{Johansen:1999:NAP,
  author    = "Dag Johansen and Keith Marzullo and Fred B. Schneider and
               Kjetil Jacobsen and Dmitrii Zagorodnov",
  title     = "{NAP}: Practical Fault-Tolerance for Itinerant Computations",
  booktitle = pro-icdcs99,
  pages     = {180--189},
  year      = {1999},
  editor    = {Mohamed G.
Gouda},
  address   = {Austin, TX, USA},
  month     = jun,
  publisher = pub-IEEE
}

@Article{Jutla:1999:MBS,
  author          = "Dawn Jutla and Peter Bodorik and Catherine Hajnal and
                     Charles Davis",
  title           = "Making Business Sense of Electronic Commerce",
  journal         = j-COMPUTER,
  volume          = "32",
  number          = "3",
  pages           = "67--75",
  month           = mar,
  year            = "1999",
  coden           = "CPTRB4",
  ISSN            = "0018-9162",
  bibdate         = "Sat Mar 6 09:04:10 MST 1999",
  url             = "http://www.computer.org/computer/co1999/r3067abs.htm;
                     http://dlib.computer.org/co/books/co1999/pdf/r3067.pdf",
  acknowledgement = ack-nhfb,
  annote          = "A good overview over the issues involved in adopting
                     and applying e-commerce in different fields of
                     business. Business models and application frameworks
                     are presented."
}

@Article{Karaata:1999:SAB,
  author  = "Mehmet Hakan Karaata and Pranay Chaudhuri",
  title   = "A self-stabilizing algorithm for bridge finding",
  journal = j-DC,
  volume  = "12",
  pages   = "47--53",
  year    = "1999",
  annote  = "Finds edges which partition the graph if they are removed.
             Builds upon spanning tree algorithm by
             \cite{Huang:1992:SSA}."
}

@Article{Kelley:1999:HTB,
  author  = {Robert E. Kelley},
  title   = {How to be a star engineer},
  journal = {IEEE Spectrum},
  year    = {1999},
  volume  = {36},
  number  = {10},
  pages   = {51--58},
  month   = oct,
  annote  = {Reports on a study about engineer work performance and
             discusses many misconceptions. Argues that star performers
             are normal workers who are treated in a special way.
keyword: Soft skills, also for managers.}
}

@MastersThesis{Kloppenburg:1999:EPS,
  author = {Sven Kloppenburg},
  title  = {Entdecken globaler {Pr\"adikate} in verteilten {Systemen} mit
            {Anhalteausf\"allen}},
  school = {Technische Universit\"at Darmstadt, Fachbereich Informatik,
            Fachgebiet Betriebssysteme},
  year   = {1999},
  type   = {Diplomarbeit},
  month  = sep,
  note   = {DA-BS-1999-02},
  annote = {Results published in \cite{Gaertner:2000:CDG}. A cite for the
            term ``Anhalteausfall'', German for ``crash''.}
}

@PhdThesis{Kulkarni:1999:CBD,
  author = {Sandeep S. Kulkarni},
  title  = {Component Based Design of Fault-Tolerance},
  school = {Department of Computer and Information Science, The Ohio State
            University},
  year   = {1999},
  annote = {Several papers contain results of this thesis, e.g.
            \cite{Arora:1998:CDM}.}
}

@InProceedings{Kulkarni:1999:CSC,
  author    = {Sandeep S. Kulkarni and John Rushby and Natarajan Shankar},
  title     = {A Case-Study in Component-Based Mechanical Verification of
               Fault-Tolerant Programs},
  booktitle = pro-wss99,
  pages     = {33--40},
  year      = {1999},
  editor    = {Anish Arora},
  address   = {Austin, TX, USA},
  month     = jun,
  publisher = pub-IEEE
}

@Article{Lange:1999:SGR,
  author  = "Danny B. Lange and Mitsuru Oshima",
  title   = "Seven good reasons for mobile agents",
  journal = j-CACM,
  volume  = "42",
  number  = "3",
  pages   = "88--89",
  month   = mar,
  year    = "1999",
  url     = "http://www.acm.org/pubs/articles/journals/cacm/1999-42-3/p88-lange/p88-lange.pdf",
  annote  = "While the title states otherwise, the reasons presented here
             are to me rather non-reasons: 1. they reduce the network load,
             2. they overcome network latency, 3. they encapsulate
             protocols, 4. they execute asynchronously and autonomously,
             5. they adapt dynamically, 6.
they are naturally heterogeneous, 7. they are robust and fault-tolerant.
             I find the way in which the individual reasons are presented
             very non-convincing, probably because the exposition is so
             brief. Some applications of agents are given (e-commerce,
             personal assistance, secure brokering, distributed
             information retrieval, ...)."
}

@Article{Lewis:1999:BCM,
  author  = "Ted Lewis",
  title   = "Binary Critic: Mainframes Are Dead, Long Live Mainframes",
  journal = j-COMPUTER,
  volume  = "32",
  number  = "8",
  pages   = "104, 102--103",
  month   = aug,
  year    = "1999",
  url     = "http://dlib.computer.org/co/books/co1999/pdf/r8104.pdf",
  annote  = "Argues that mainframes are experiencing a revival because of
             their unmatched reliability. Gives some figures: Cost of
             downtime ranges from \$1000 per minute for simple e-mail to
             \$13000 per minute for enterprise resource planning
             applications. Also: An IBM S/390 sysplexed mainframe only has
             10 minutes outage per year, while a Windows-NT-based PC has
             about 224.5 hours outage per year (table 1)."
}

@TechReport{Mantel:1999:CSM,
  author = {Heiko Mantel and Felix C.
G{\"a}rtner},
  title       = {A case study in the mechanical verification of fault
                 tolerance},
  institution = {Department of Computer Science, Darmstadt University of
                 Technology},
  year        = {1999},
  number      = {TUD-BS-1999-08},
  address     = {Darmstadt, Germany},
  month       = nov
}

@InProceedings{Mostefaoui:1999:SCU,
  author    = {Achour Mostefaoui and Michel Raynal},
  title     = {Solving Consensus Using {Chandra-Toueg's} Unreliable Failure
               Detectors: a General Quorum-Based Approach},
  booktitle = {Proceedings of the 13th International Symposium on
               Distributed Computing (DISC'99)},
  year      = {1999},
  volume    = {1693},
  series    = ser-LNCS,
  address   = {Bratislava, Slovak Republic},
  month     = sep,
  publisher = pub-SV,
  annote    = {Uses dynamic quorums to define when a value may be decided.}
}

@Article{Oberg:1999:WMP,
  author  = "James Oberg",
  title   = "Why the {Mars} Probe went off course",
  journal = "IEEE Spectrum",
  volume  = "36",
  number  = "12",
  pages   = "34--39",
  month   = dec,
  year    = "1999",
  crindex = "Journal",
  url     = "http://www.spectrum.ieee.org/spectrum/dec99/features/mars.html",
  annote  = "A detailed report on why the Mars Climate Orbiter crashed
             onto the surface of Mars in 1999. Popularly believed to be
             only an error in taking metric and British measurement units,
             the article shows that the orbiter failed to follow the right
             trajectory also partly because of severe management mistakes
             and sensor inaccuracies: Uncertainty led to assuming good
             things instead of bad things, so instead of a safe fly-by the
             orbiter must have crashed onto the surface of Mars (even that
             is not sure)."
}

@InProceedings{Pagnia:1999:EGP,
  author    = {Henning Pagnia and Holger Vogt},
  title     = {Exchanging goods and payment in electronic business
               transactions},
  booktitle = {Proceedings of the Third European Research Seminar on
               Advances in Distributed Systems (ERSADS'99)},
  year      = {1999},
  address   = {Madeira Island, Portugal},
  month     = apr,
  note      = {Proceedings distributed as copies at the conference.},
  annote    = {similar to \cite{Vogt:1999:FAE} but in English; I have an
               electronic copy in literature/pagnia-ersads.ps. A similar
               protocol has appeared in \cite{Zhou:1996:FNP}, a shorter
               presentation is \cite{Schneider:1998:FAN}.}
}

@TechReport{Pagnia:1999:IFE,
  author      = {Henning Pagnia and Felix C. G{\"a}rtner},
  title       = {On the impossibility of fair exchange without a trusted
                 third party},
  institution = {Department of Computer Science, Darmstadt University of
                 Technology},
  year        = {1999},
  number      = {TUD-BS-1999-02},
  address     = {Darmstadt, Germany},
  month       = mar,
  url         = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-02.ps.gz},
  note        = {Available at
                 \url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-02.ps.gz}.
                 A substantially revised version is available upon request
                 from the authors.}
}

@InProceedings{Pedone:1999:GB,
  title  = "Generic Broadcast",
  year   = "1999",
  author = "Fernando Pedone and Andr{\'e}
Schiper",
  booktitle = "Proceedings of the 13th International Symposium on
               Distributed Computing (DISC'99)",
  month     = sep,
  url       = "http://lsewww.epfl.ch/Documents/acrobat/PS99c.pdf",
  annote    = "see also \cite{Aguilera:2000:TGB} [to get]"
}

@Article{Reagle:1999:PPP,
  author          = "Joseph Reagle and Lorrie Faith Cranor",
  title           = "The platform for privacy preferences",
  journal         = j-CACM,
  volume          = "42",
  number          = "2",
  pages           = "48--55",
  month           = feb,
  year            = "1999",
  coden           = "CACMA2",
  ISSN            = "0001-0782",
  bibdate         = "Fri Feb 5 07:01:55 MST 1999",
  url             = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p48-reagle/",
  acknowledgement = ack-nhfb,
  annote          = "PPP is a way of formally stating privacy policies
                     within web pages and making privacy practices
                     comparable and automatically manageable. Related work
                     is the TRUSTe seal \cite{Benassi:1999:T}."
}

@Article{Reicherzer:1999:AUA,
  author  = {Judith Reicherzer},
  title   = {{Angeklickt und abgezockt}},
  journal = {Die Zeit},
  year    = {1999},
  number  = {34},
  pages   = {20--21},
  month   = "19.~" # aug,
  annote  = {Gute Motivation fuer die Notwendigkeit von Fair Exchange.}
}

@Article{Reiter:1999:AWT,
  author          = "Michael K. Reiter and Aviel D. Rubin",
  title           = "Anonymous {Web} transactions with {Crowds}",
  journal         = j-CACM,
  volume          = "42",
  number          = "2",
  pages           = "32--38",
  month           = feb,
  year            = "1999",
  coden           = "CACMA2",
  ISSN            = "0001-0782",
  bibdate         = "Fri Feb 5 07:01:55 MST 1999",
  url             = "http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p32-reiter/",
  acknowledgement = ack-nhfb,
  annote          = "One of the prominent projects to achieve anonymity on
                     the web. The approach of crowds uses a
                     nondeterministic forwarding service between clients
                     within a crowd. A web server receiving a request
                     cannot know whether the request originated from the
                     sender or from some other member of the crowd.
The concept can even provide privacy against a number of collaborating
                     members of the crowds itself. Disadvantages of crowds
                     are (among others) the increased retrieval latency,
                     and having to protect the confidentiality of the
                     message against other crowd members. Compared against
                     the anonymizer for example, crowds has no single
                     point where privacy can be compromised. Crowds has
                     been implemented and deployed in the US. Some
                     practical issues are also discussed and references to
                     research papers are given. Other methods to achieve
                     privacy are onion routing \cite{Goldschlag:1999:OR},
                     anonymizer (\url{www.anonymizer.com}), LPWA
                     \cite{Gabber:1999:CYA}. Relevant other articles are
                     \cite{Reagle:1999:PPP,Benassi:1999:T}."
}

% Child of the Vitek:1999:SIP collection (crossref); classic BibTeX needs
% that parent entry to appear later in the file than this one.
@InCollection{Roth:1999:MPC,
  author    = {Volker Roth},
  title     = {Mutual Protection of Co-operating Agents},
  booktitle = "Secure Internet Programming: Security Issues for Mobile and
               Distributed Objects",
  pages     = "277--287",
  crossref  = "Vitek:1999:SIP",
  annote    = "ref von Uwe Wilhelm"
}

% No author field: key provides the sorting/label fallback, and the
% corporate editors are braced so they are not split into first/last names.
@Misc{Semper:1999:ASA,
  key          = {SEMPER},
  editor       = "{SEMPER Consortium} and {IBM Z{\"u}rich}",
  title        = {Advanced Services, Architecture and Design},
  howpublished = {SEMPER Deliverable D10; La Gaude},
  month        = mar,
  year         = {1999},
  note         = {Available at http://www.semper.org/deliver/d10/d10.ps.gz},
  annote       = {Part of the final report on the SEMPER project.}
}

@TechReport{Sergent:1999:FDI,
  author      = {Nicole Sergent and Xavier D{\'e}fago and Andr{\'e} Schiper},
  title       = {Failure Detectors: implementation issues and impact on
                 consensus performance},
  institution = {{\'Ecole} Polytechnique {F\'ed\'erale} de Lausanne,
                 Switzerland},
  year        = {1999},
  number      = {SSC/1999/019},
  annote      = {This paper presents several different ways to implement
                 crash failure detectors and measures the impact of these
                 implementations on the performance of the Chandra Toueg
                 Consensus algorithm \cite{Chandra:1996:UFD}.
The different implementations are: heart beat (a node periodically sends
                 `alive' messages), interrogation (nodes keep exchanging
                 `are you alive', `alive' messages), and two
                 optimizations: use only critical messages to do request
                 response type failure detection, sending heart beats only
                 between critical requests/responses. The simulation of
                 the consensus algorithm shows that the time out used to
                 implement suspicions together with the period interval of
                 sending failure detector messages have optimal
                 combinations regarding the termination time of the
                 algorithm. It is argued that using failure detectors does
                 not relieve the engineer from considering timing issues
                 (also indicated by \cite{Fetzer:1999:CTA}).}
}

@InProceedings{Theel:1999:EPC,
  author    = {Oliver Theel and Felix C. G{\"a}rtner},
  title     = {An Exercise in Proving Convergence through Transfer
               Functions},
  booktitle = pro-wss99,
  pages     = {41--47},
  year      = {1999},
  editor    = {Anish Arora},
  address   = {Austin, TX, USA},
  month     = jun,
  publisher = pub-IEEE,
  annote    = {A simpler example than in \cite{Theel:1998:OPS}, still not
               distributed, but from an algorithms viewpoint.}
}

@InProceedings{Theel:1999:OPT,
  author = {Oliver Theel and Felix C.
{G\"artner}},
  title = {On proving termination through transfer functions},
  booktitle = {Proceedings of the 4th International Workshop on Termination},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  year = {1999},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {Dagstuhl, Germany},
  month = may,
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {}
}

@InCollection{Verissimo:1999:TDS,
  author = {Paulo Ver{\'\i}ssimo and Michel Raynal},
  title = {Time in distributed system models and algorithms},
  booktitle = {Advances in Distributed Systems, Part I -- Distributed Algorithms},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  publisher = {ESPRIT Broadcast, Springer-Verlag},
  year = {1999},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTtype = {},
  OPTchapter = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth = {},
  note = {to appear, available at \url{http://www.navigators.di.fc.ul.pt/archive/TimeBcast.ps.gz}},
  annote = {Summarizes a great deal of work already published elsewhere. First briefly sketches the controversy synchrony vs. asynchrony and states that today, timeliness constraints in terms of real-time are increasingly important, especially in dependable systems (flight control) or QoS applications. This leads to the quasi synchronous system model, which is then briefly elaborated on (for a more detailed explanation, see \cite{Almeida:1998:QSA}). Timing failure detectors (as generalizations of crash failure detectors \cite{Chandra:1996:UFD}) are presented, motivated and implemented in the quasi synchronous setting. Timing failure detectors are complete in a safety sense (i.e., they detect timing failures within a known real-time bound). Such failure detectors can be generalized to QoS failure detectors.
Then the CesiumSpray system for global clock synchronization is presented (a hierarchical and hybrid one to exploit the characteristics of different WAN/LAN settings), then follow some generalizations of causal or temporal precedence orders which also take events outside of the system into account and try to order them (I did not read that too carefully). Finally, some protocols to achieve such order are presented.}
}

@Book{Vitek:1999:SIP,
  editor = "J. Vitek and C. Jensen",
  title = "Secure Internet Programming: Security Issues for Mobile and Distributed Objects",
  volume = "1603",
  publisher = pub-SV,
  address = "New York, NY, USA",
  year = "1999",
  series = "Lecture Notes in Computer Science",
  keywords = "Computer security; Electronic data processing --- Distributed processing --- Security; Intelligent agents (Computer software) --- Security measures; Mobile agents (Computer software)",
}

@InProceedings{Vogt:1999:FAE,
  author = {Holger Vogt and Henning Pagnia},
  title = {{Fairer Austausch beim elektronischen Einkauf im Internet}},
  booktitle = {Proceedings of the 6th DFN-CERT Workshop ``Sicherheit in vernetzten Systemen''},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  year = {1999},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {Hamburg, Germany},
  month = mar,
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {in German},
  annote = {Ueberblick ueber Protokolle zum fairen Austausch von Ware gegen Geld in unsicheren Netzen wie dem Internet. Diskussion der Begriffe starker und schwacher Fairness von Asokan und der spaerlichen Literatur zu diesem Thema. Vorstellung einiger Protokolle zur starken Fairness mit Vermittlern: (1) mit aktivem Vermittler, (2) optimistisch mit generierbaren Waren, (3) optimistisch mit Zahlungswiderrufsmoeglichkeit, (4) optimistisch mit generierbarer Ware und Widerrufbarkeit.
Am Ende Diskussion von Anonymitaet, die wenig Auswirkungen auf die vorgestellten Protokolle hat.}
}

@InCollection{Wilhelm:1999:ITT,
  author = {U. G. Wilhelm and S. Staamann and L. Butty{\'a}n},
  title = {Introducing Trusted Third Parties to the Mobile Agent Paradigm},
  booktitle = {Secure Internet Programming: Security Issues for Mobile and Distributed Objects},
  editor = {J. Vitek and C. Jensen},
  pages = {471--491},
  series = {Lecture Notes in Computer Science},
  volume = {1603},
  publisher = pub-SV,
  address = {New York, NY, USA},
  year = {1999},
  keywords = {Security},
  annote = "[got it?]"
}

@PhdThesis{Wilhelm:1999:TAP,
  author = {U. G. Wilhelm},
  title = {A Technical Approach to Privacy based on Mobile Agents protected by Tamper-resistant Hardware},
  school = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  year = 1999,
  address = {Switzerland},
  number = {1961},
  month = may
}

@Article{Aguilera:2000:FDC,
  author = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  title = "Failure Detection and Consensus in the Crash Recovery Model",
  journal = j-DC,
  year = "2000",
  alt-url = "http://www.cs.cornell.edu/home/sam/FDpapers/crash-recovery-finaldcversion.ps",
  url = "http://link.springer.de/link/service/journals/00446/papers/0013002/00130099.pdf",
  pages = "99--125",
  volume = "13",
  number = "2",
  month = apr,
  abstract = "We study the problems of failure detection and consensus in asynchronous systems in which processes may crash and recover, and links may lose messages. We first propose new failure detectors that are particularly suitable to the crash-recovery model. We next determine under what conditions stable storage is necessary to solve consensus in this model. Using the new failure detectors, we give two consensus algorithms that match these conditions: one requires stable storage and the other does not.
Both algorithms tolerate link failures and are particularly efficient in the runs that are most likely in practice - those with no failures or failure detector mistakes. In such runs, consensus is achieved within $3 \delta$ time and with 4 n messages, where $\delta$ is the maximum message delay and n is the number of processes in the system.",
  annote = "Description in \cite{Aguilera:1998:FDCTR}."
}

@Article{Aguilera:2000:QRC,
  author = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title = {On quiescent reliable communication},
  journal = {SIAM Journal on Computing},
  year = {2000},
  OPTkey = {},
  volume = {29},
  number = {6},
  pages = {2040--2073},
  month = dec,
  url = {\url{http://www.cs.cornell.edu/Info/People/sam/FDpapers/ACTquiescent-SIAM.ps}},
  annote = {Quiescent algorithms are those that eventually stop sending messages. Quiescent reliable communication protocols are algorithms like reliable broadcast or uniform reliable broadcast that are quiescent. The authors study quiescent reliable communication algorithms in systems where processes may crash and links are fair. A link is fair if it does not introduce spurious messages and if a message which is sent infinitely often is received infinitely often. In such systems it is impossible to implement quiescent reliable communication without failure detectors. Why? Reliable communication means that whenever nodes $s$ and $r$ are correct and $s$ sends a message to $r$, then $r$ must eventually receive the message. However, $s$ must achieve this by sending only finitely many messages. Any number of messages may be lost due to the fair channels, and so $s$ can never be sure whether $r$ has crashed or is alive if it does not receive an acknowledgement. Failure detection can help in this case. However, the usual failure detectors which output lists of suspects are not very useful.
Any such bounded failure detector that helps solve quiescent reliable communication is at least as powerful as the eventually perfect failure detector. Why? The bound on the output of the failure detector implies that eventually it will keep on repeating the same (limit) values again and again. The existence of a quiescent communication primitive however implies that the limit value is in fact the set of correct processes. Thus, using this failure detector it is possible to emulate an eventually perfect failure detector. Next, the authors introduce a new type of failure detector called Heartbeat which has an unbounded output range. The range is a vector of elements (one for each process, or neighboring process) that keeps on increasing without bound as long as that process is alive. Thus, the failure detector can now be used to keep the system going. To achieve quiescence it is now possible to take a change in the heartbeat failure detector as the cause of a retransmission unless an acknowledgement has been received. In a sense, the decision whether to stop or not is transferred into the failure detector. Obviously, heartbeat is implementable in asynchronous systems (the authors give an implementation), and naturally, such an implementation cannot be quiescent. In systems where heartbeat is available quiescent reliable communication can be achieved and so fair links lose their danger: many algorithms that rely on reliable links can now be transformed into environments with lossy links (fair ones, not fair lossy \cite{Basu:1996:SRL}), whenever Heartbeat is available. It must however be checked whether reliable can be substituted with ``quasi-reliable'' communication (quasi-reliable is equal to reliable if processes do not crash during quasi-reliable sending). The concluding remarks touch some other interesting points: (1) message buffering can be limited by at some point excluding suspected processes from the active group (i.e. explicitly crashing them).
The heartbeat implementation will however ensure that no messages are sent to them long before they are excluded. (2) a terminating protocol is quiescent, but a quiescent protocol need not terminate. A layering technique is proposed that has failure detection as a basic mechanism (non-quiescent, non-terminating), building upon failure detection is reliable communication (quiescent, non-terminating), and on top can be terminating applications like consensus. (3) fair lossy \cite{Basu:1996:SRL} is opposed to fair channels, stating that the results also hold for fair lossy links, only that expensive piggybacking is required in this case. (4) failure detectors with finite output range have limitations (this is obvious from the fact that quiescent reliable communication needs an eventually perfect failure detector if the output range is bounded and such a detector is impossible to implement in asynchronous systems). However, when comparing failure detectors it is necessary to see whether the transformation is quiescent too. }
}

@InProceedings{Aguilera:2000:TGB,
  author = {Marcos Kawazoe Aguilera and Carole Delporte-Gallet and Hugues Fauconnier and Sam Toueg},
  title = {Thrifty generic broadcast},
  booktitle = {Proceedings of the 14th International Symposium on Distributed Computing (DISC)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {268--282},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  number = {1914},
  series = ser-LNCS,
  address = {Toledo, Spain},
  month = oct,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Looks at atomic broadcasts where the total order may be relaxed. Implementations of such operators can of course rely on atomic broadcast, but this is unsatisfactory. The strictness property proposed by Pedone and Schiper (generic broadcast) is not sufficient. In this paper, new definitions for a broadcast to be a good implementation of generic broadcast are proposed. The definition is based on the notion of using an oracle like a failure detector.
A generic broadcast implementation is good (=thrifty) if the implementation uses the oracle only when conflicting messages need to be processed (a more formal definition is: if there is a time after which only non-conflicting messages are broadcast, then there is a time after which the oracle is not used anymore). The oracle used is in fact atomic broadcast.}
}

@InProceedings{Arora:2000:RVC,
  author = "Anish Arora and Sandeep Kulkarni and Murat Demirbas",
  title = "Resettable vector clocks",
  booktitle = "Proceedings of the Nineteenth Annual ACM Symposium on Principles of Distributed Computing (PoDC)",
  pages = "269--278",
  year = "2000",
  annote = "Resettable vector clocks are vector clocks that use bounded state space. This paper identifies assumptions under which vector clocks may be replaced by resettable vector clocks in an application without endangering its correctness. Then resettable vector clocks are made stabilizing fault tolerant using detectors and correctors (a global reset is fired on local detection)."
}

@InCollection{Arora:2000:S,
  author = {Anish Arora},
  title = {Stabilization},
  booktitle = {Encyclopedia of Distributed Computing},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  publisher = {Kluwer},
  year = {2000},
  editor = {Partha Dasgupta and Joseph E. Urban},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTtype = {},
  OPTchapter = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth = {},
  url = "\url{ftp://ftp.cis.ohio-state.edu/pub/anish/papers/stb.ps.gz}",
  OPTnote = {},
  OPTannote = {}
}

@Article{Bernhardt:2000:RDR,
  author = {Ute Bernhardt},
  title = {{Reiten auf der Risikowelle (Editorial zum Sonderheft zum Thema ``Verletzlichkeit der Informationsgesellschaft'')}},
  journal = {FIfF Kommunikation},
  year = {2000},
  OPTkey = {},
  OPTvolume = {},
  number = {3},
  pages = {3},
  month = sep,
  OPTnote = {},
  annote = {Editorial zum Sonderheft.
Im Sonderheft selbst sind ausnahmslos lesenswerte Artikel beispielsweise ueber kritische Infrastrukturen \cite{Schulzki:2000:KI}, Cybercrime, Jugendschutz im Internet und Vertrauen. Interessant ist, dass etwa zur selben Zeit eine thematisch aehnliche Ausgabe von IEEE Computer erscheint \cite{Jones:2000:CBS}.}
}

@InProceedings{Boichat:2000:RBC,
  author = {Romain Boichat and Rachid Guerraoui},
  title = {Reliable Broadcast in the Crash-Recovery Model},
  booktitle = pro-srds2000,
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {N\"urnberg, Germany},
  month = oct,
  OPTorganization = {},
  publisher = pub-IEEE,
  OPTnote = {},
  OPTannote = {}
}

@Article{Bowen:2000:ESC,
  author = {Jonathan Bowen},
  title = {The ethics of safety-critical systems},
  journal = j-CACM,
  year = {2000},
  OPTkey = {},
  volume = {43},
  number = {4},
  pages = {91--97},
  month = apr,
  OPTnote = {},
  annote = {Presents sins and truths of safety critical systems engineering.
Explicitly discusses formal methods.}
}

@TechReport{Brasileiro:2000:COC,
  author = {Francisco Brasileiro and {Fab\'\i{}ola} Greve and Achour {Most\'efaoui} and Michel Raynal},
  title = {Consensus in one communication step},
  institution = {IRISA},
  year = {2000},
  OPTkey = {},
  OPTtype = {},
  number = {PI-1321},
  address = {Rennes, France},
  OPTmonth = {},
  OPTnote = {},
  annote = {[to read]}
}

@InProceedings{Breitling:2000:MFD,
  author = {Max Breitling},
  title = {Modeling faults of distributed, reactive systems},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant Systems, 6th International Symposium (FTRTFT 2000) Proceedings},
  OPTcrossref = {},
  OPTkey = {},
  pages = {58--69},
  year = {2000},
  editor = {Mathai Joseph},
  OPTvolume = {},
  number = {1926},
  series = ser-LNCS,
  address = {Pune, India},
  month = sep,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Models faults as addition of variables and transitions in a special formalism (Fokus) which supports compositionality and refinement.}
}

@Article{Buschek:2000:M4W,
  author = {Oliver Buschek},
  title = {{Mit dem 486er zur Raumstation}},
  journal = {Chip},
  year = {2000},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  pages = {92--98},
  month = feb,
  OPTnote = {},
  annote = {Leicht verstaendlicher Einstieg in Themen der Fehlertoleranz im Weltraum. Fokus auf ISS: Dort sind 6fach redundante Schuhschachteln, die Byzantinisches Agreement machen, drin. Gibt auch Hinweise auf Webadressen der Nasa und ESA.}
}

@Misc{Cachin:2000:RMU,
  OPTkey = {},
  author = {C. Cachin and J. Camenisch and M. Dacier and Y. Deswarte and J. Dobson and D. Horne and K. Kursawe and J.-C. Laprie and J.-C. Lebraud and D. Long and T. McCutcheon and J. {M\"uller} and F. Petzold and B. Pfitzmann and D. Powell and B. Randell and M. Schunter and V. Shoup and P. Ver{\'\i}ssimo and G. Trouessin and R. J. Stroud and M. Waidner and I. S.
Welch},
  title = {Reference Model and Use Cases},
  OPThowpublished = {},
  month = aug,
  year = 2000,
  note = {Deliverable D1 of the MAFTIA project \cite{MAFTIA}.},
  OPTannote = {}
}

@InProceedings{Cachin:2000:ROC,
  author = {Christian Cachin and Klaus Kursawe and Victor Shoup},
  title = {Random oracles in {Constantinople}: practical asynchronous {Byzantine} agreement using cryptography},
  booktitle = {Proceedings of the Symposium on Principles of Distributed Computing},
  OPTcrossref = {},
  OPTkey = {},
  pages = {123--132},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {Portland, Oregon},
  OPTmonth = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {Looks at randomized Byzantine agreement and presents an optimistic solution using a randomized and cryptographically secure coin toss. This is a good example on what and where fault-tolerance can learn from cryptography.}
}

@InProceedings{Charron-Bost:2000:RSL,
  author = {Bernadette Charron-Bost and Sam Toueg and Anindya Basu},
  title = {Revisiting safety and liveness in the context of failures},
  booktitle = {Proceedings of CONCUR2000 -- Concurrency Theory, 11th Int. Conference},
  OPTcrossref = {},
  OPTkey = {},
  pages = {552--565},
  year = {2000},
  editor = {C. Palamidessi},
  OPTvolume = {},
  number = {1877},
  series = ser-LNCS,
  address = {University Park, PA},
  month = aug,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Agreement in consensus is defined as ``no two correct processes decide differently''. Against common belief, this is a liveness property in systems where processes may crash. This is because if two processes have decided differently, then agreement can still be achieved if one of them crashes. The authors define pure safety and pure liveness meaning that safety and liveness hold without ``the help or non-help of failures''. Pure liveness means that something good can still happen without the help of failures.
Pure safety means that executions which do not satisfy the property must contain failures to satisfy the property. Pure versions are strictly weaker than the original versions. The authors define a property transformer `Pure' that `makes a property pure' by removing all executions which contain undesirable partial runs. Pure agreement demands that no two alive processes decide differently and comes closer to our intuition of agreement in consensus. Pure agreement is stronger than uniform agreement but weaker than agreement. Shows that every pure property is the intersection of a pure safety and a pure liveness property. A startling paper which demands more investigation!}
}

@InProceedings{Charron-Bost:2000:SSP,
  author = "Bernadette Charron-Bost and Rachid Guerraoui and {Andr\'e} Schiper",
  title = "Synchronous System and Perfect Failure Detector: Solvability and Efficiency Issues",
  booktitle = "International Conference on Dependable Systems and Networks (IEEE Computer Society)",
  year = "2000",
  annote = "Looks at the relation between the synchronous system model and the asynchronous model augmented with perfect failure detectors. They show that there are problems which are solvable in synchronous systems but are unsolvable in asynchronous systems with perfect failure detectors. Hence, both models are not equivalent in this respect. This is because failure detectors give no information on the causal relation between the crash event and other events on the crashed process. This means that you cannot decide whether there is still a message in transit coming from the crashed process or not. If you want to base a decision on this fact you have the same dilemma as in FLP \cite{Fischer:1985:IDC}. But if it comes to consensus, both models are ok because you can solve this problem in both. However, in synchronous systems algorithms can be constructed with a lower latency degree \cite{Schiper:1997:ECA} so more efficient solutions are possible in the synchronous model."
}

@InProceedings{Chen:2000:QOS,
  author = {Wei Chen and Sam Toueg and Marcos Kawazoe Aguilera},
  title = {On the quality of service of failure detectors},
  booktitle = {Proceedings of the International Conference on Dependable Systems and Networks (DSN 2000)},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {New York},
  month = jun,
  OPTorganization = {},
  publisher = pub-IEEE,
  OPTnote = {},
  annote = {In a system model where message delays and losses follow some probability distribution the authors study performance metrics regarding the accuracy and completeness of the failure detectors which were introduced in the time free model \cite{Chandra:1996:UFD}. Metrics concerning completeness are the detection time (i.e. the time between the crash and the detection of the crash). Metrics concerning accuracy are mistake recurrence time (the time between two successive mistakes) and mistake duration (the time it takes to correct a mistake). Other accuracy metrics can be derived from them (average mistake rate, query accuracy probability, good period duration, forward good period duration). An algorithm is presented which achieves optimality concerning some metrics and is based on synchronized clocks: a timeout is started not when a heartbeat arrives but at certain freshness points which are at equal intervals at both processes (with a message delay difference). Discusses how to tune the parameters of the algorithm to perform nearly optimal and presents some ideas concerning adaptivity. Gives an overview over other failure detection approaches in the literature.}
}

@Article{Crawford:2000:BNP,
  author = {Gregory P. Crawford},
  title = {A bright new page in portable displays},
  journal = {IEEE Spectrum},
  year = {2000},
  OPTkey = {},
  volume = {37},
  number = {10},
  pages = {40--46},
  month = oct,
  OPTnote = {},
  annote = {Gives insight in new display technology aka smart paper.
Presents some fascinating photos of a cholesteric LCD display of Kent Displays Inc., Kent, Ohio, which is reflective (needs no back light) and does not need power to hold the image. Also describes the technologies behind this display and Gyricon (Xerox) and E ink.}
}

@PhdThesis{Defago:2000:ARP,
  author = {Xavier {D\'efago}},
  title = {Agreement-related problems: from semi-passive replication to totally ordered broadcast},
  school = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  year = {2000},
  OPTkey = {},
  OPTtype = {},
  address = {Lausanne, Switzerland},
  OPTmonth = {},
  note = {Thesis number 2229},
  OPTannote = {}
}

@Article{Ditlea:2000:PCG,
  author = {Steve Ditlea},
  title = {The {PC} goes ready-to-wear},
  journal = {IEEE Spectrum},
  year = {2000},
  OPTkey = {},
  volume = {37},
  number = {10},
  pages = {34--39},
  month = oct,
  OPTnote = {},
  annote = {This is more a market survey of wearables, presenting display technology, prototypes (Xybernaut, IBM etc) and e.g. Twiddler chorded keyboard. For a visionary article see \cite{Billinghurst:1999:WDN}. }
}

@Book{Dolev:2000:SS,
  author = {Shlomi Dolev},
  ALTeditor = {},
  title = {Self-Stabilization},
  publisher = {MIT Press},
  year = {2000},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth = {},
  OPTnote = {},
  OPTannote = {}
}

@InProceedings{Echtle:2000:FFM,
  author = {Klaus Echtle and Asif Masum},
  title = {A fundamental failure model for fault-tolerant protocols},
  booktitle = {Proceedings of the IEEE International Computer Performance and Dependability Symposium (IPDS2K)},
  OPTcrossref = {},
  OPTkey = {},
  pages = {69--78},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {Chicago, IL},
  OPTmonth = {},
  OPTorganization = {},
  publisher = pub-IEEE,
  OPTnote = {},
  annote = {See also \cite{Echtle:1999:UCB,Masum:2000:NCB}.
A more elaborate description is attached to the entry of \cite{Echtle:1999:UCB}.}
}

@InProceedings{Gaertner:2000:CDG,
  author = {Felix C. {G\"artner} and Sven Kloppenburg},
  title = {Consistent Detection of Global Predicates Under a Weak Fault Assumption},
  booktitle = pro-srds2000,
  OPTcrossref = {},
  OPTkey = {},
  pages = {94--103},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  address = {N\"urnberg, Germany},
  month = oct,
  OPTorganization = {},
  publisher = pub-IEEE,
  OPTnote = {},
  OPTannote = {}
}

@TechReport{Gaertner:2000:RIS,
  author = {Felix C. {G\"artner} and Hagen {V\"olzer}},
  title = {Redundancy in space in fault-tolerant systems},
  institution = {Department of Computer Science, Darmstadt University of Technology},
  year = {2000},
  OPTkey = {},
  OPTtype = {},
  number = {TUD-BS-2000-06},
  address = {Darmstadt, Germany},
  month = jul,
  url = "\url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2000-06.ps.gz}",
  OPTnote = {},
  OPTannote = {}
}

@InProceedings{Hiller:2000:EAD,
  author = {Martin Hiller},
  title = {Executable assertions for detecting data errors in embedded control systems},
  OPTcrossref = {},
  OPTkey = {},
  booktitle = {Proceedings of the International Conference on Dependable Systems and Networks (DSN 2000)},
  pages = {24--33},
  year = {2000},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  OPTmonth = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  OPTannote = {}
}

@InProceedings{Huang:2000:TFP,
  author = {Shing-Tsaan Huang},
  title = {The fuzzy philosophers},
  booktitle = {Proceedings of the 15th IPDPS 2000 Workshops},
  OPTcrossref = {},
  OPTkey = {},
  pages = {130--136},
  year = {2000},
  editor = {J.
Rolim and others},
  volume = {1800},
  OPTnumber = {},
  series = ser-LNCS,
  address = {Cancun, Mexico},
  month = may,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Generalization of the dining philosophers and a self-stabilizing solution.}
}

@Article{Hutter:2000:AII,
  author = {Reinhard Hutter},
  title = {{Angriffe auf Informationstechnik und Infrastrukturen -- Realit\"at oder Science Fiction?}},
  journal = {Aus Politik und Zeitgeschichte},
  year = {2000},
  OPTkey = {},
  volume = {41--42},
  OPTnumber = {},
  pages = {31--38},
  OPTmonth = {},
  OPTnote = {},
  annote = {Gute Einfuehrung in und Referenz zu kritischen Infrastrukturen, eher aus allgemeinverstaendlicher und politikwissenschaftlicher Sicht.}
}

@Article{Jones:2000:CBS,
  author = {Anita Jones},
  title = {The challenge of building survivable information-intensive systems (introduction to special issue on ``critical infrastructures'')},
  journal = {IEEE Computer},
  year = {2000},
  OPTkey = {},
  volume = {33},
  number = {8},
  pages = {39--43},
  month = aug,
  OPTnote = {},
  annote = {A German Journal with similar directions appeared at about the same time \cite{Bernhardt:2000:RDR}.}
}

@InProceedings{Karjoth:2000:SMA,
  author = {G{\"u}nter Karjoth},
  title = "Secure Mobile Agent-Based Merchant Brokering in Distributed Marketplaces",
  booktitle = asama2000,
  pages = "44--56",
  year = 2000,
  address = "Zurich, Switzerland",
  month = sep,
  volume = "1882",
  series = ser-LNCS,
  publisher = pub-SV,
  keywords = "agents, e-commerce, security, mobile agent",
  abstract = {Cooperating merchants establish a distributed marketplace under the auspices of an independent market authority. Each merchant's server is equipped with a trusted device, a smart card for example, provided by the market authority. The market authority plays the role of a trusted third party for the customer as well as for the merchants.
This paper describes protocols that prevent the malicious alteration of the data collected by visiting mobile agents roaming through the marketplace without being detectable by subsequent servers or by the owner of the agent upon its return. Another protocol makes the trusted device a secure execution platform for routines provided by the agent owner. }
}

@Article{Kehr:2000:SV,
  author = {Roger Kehr},
  title = {Spontane {Vernetzung}},
  journal = {Informatik Spektrum},
  year = {2000},
  OPTkey = {},
  volume = {23},
  number = {3},
  pages = {161--172},
  month = jun,
  OPTnote = {},
  annote = {Good survey of the three main methods for spontaneous networking (Jini, SLP, UPnP). Also discusses Bluetooth. Good starting point for German readers.}
}

@TechReport{Kloppenburg:2000:CDG,
  author = {Sven Kloppenburg and Felix C. {G\"artner}},
  title = {Consistent detection of global predicates in asynchronous systems with crash failures},
  institution = {Darmstadt University of Technology, Department of Computer Science},
  year = {2000},
  OPTkey = {},
  OPTtype = {},
  number = {TUD-BS-2000-01},
  address = {Darmstadt, Germany},
  month = feb,
  url = "\url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2000-01.abstract.html}",
  OPTnote = {},
  OPTannote = {}
}

@InProceedings{Kulkarni:2000:AAF,
  author = {Sandeep S. Kulkarni and Anish Arora},
  title = {Automating the addition of fault-tolerance},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant Systems, 6th International Symposium (FTRTFT 2000) Proceedings},
  OPTcrossref = {},
  OPTkey = {},
  pages = {82--93},
  year = {2000},
  editor = {Mathai Joseph},
  OPTvolume = {},
  number = {1926},
  series = ser-LNCS,
  address = {Pune, India},
  month = sep,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Looks at automatically adding detectors and correctors in the sense of \cite{Arora:1998:DCT} to existing programs. Specifications are fusion- and suffix-closed, giving ``bad'' transitions which violate safety.
Idea of fail-safe fault-tolerance is to cut away all paths leading to these bad transitions. This must be possible without changing the original behavior. Non-masking fault-tolerance is achieved by adding transitions from all states outside of the invariant to states within. Masking fault-tolerance is somewhat more complex. States that adding fault-tolerance is NP-complete but refers the proof to a TR. From a conceptual point of view is similar to \cite{Gaertner:2000:RIS}. An important point in the transformation is that the fault-tolerant version must not contain ``new'' ways to satisfy the specification.}
}

@Article{Kumagai:2000:LEV,
  author = {Jean Kumagai},
  title = {faults \& failures: {London} stock exchange vanishes for 8 hours},
  journal = {IEEE Spectrum},
  year = {2000},
  OPTkey = {},
  volume = {37},
  number = {6},
  pages = {30--31},
  month = jun,
  OPTnote = {},
  annote = {Sketches the 8 hour blackout of the London stock exchange (LSE) on April 5, 2000. Slow overnight batch jobs had caused old prices to get mixed up with new prices. Frantic calls from traders persuaded the LSE to delay trading until the problem was fixed (trading is useless with wrong prices). This delay lasted 8 hours. The reason for the slow batch job was an inherent program inefficiency combined with an unusually high volume of data. Fixing required rewriting a couple of lines of code --- ``absolutely trivial''. Costs are estimated in the millions of pounds.}
}

@Article{Lamport:2000:FAH,
  author = "Leslie Lamport",
  title = "Fairness and hyperfairness",
  pages = "239--245",
  year = "2000",
  abstract = "The notion of fairness in trace-based formalisms is examined. It is argued that, in general, fairness means machine closure. The notion of hyperfairness introduced by Attie, Francez, and Grumberg is generalized to arbitrary action systems.
Also examined are the fairness criteria proposed by Apt, Francez, and Katz.",
  url = "http://link.springer.de/link/service/journals/00446/papers/0013004/00130239.pdf",
  volume = "13",
  number = "4",
  journal = j-DC,
  annote = "There's a good quote here about reasoning about liveness properties: ``Fairness conditions are a way of expressing liveness properties, and liveness properties are inherently problematic. The question of whether a real system satisfies a liveness property is meaningless; it can be answered only by observing the system for an infinite length of time, and real systems don't run forever. Liveness is always an approximation to the property we really care about. We want a program to terminate within 100 years, but proving that it does would require addition of distracting timing assumptions. So, we prove the weaker condition that the program eventually terminates. This doesn't prove that the program will terminate within our lifetimes, but it does demonstrate the absence of infinite loops.'' This is a must-read paper for people interested in liveness issues."
}

@InProceedings{Lano:2000:IBS,
  author = {K. Lano and David Clark and K. Androutsopoulos and P. Kan},
  title = {Invariant-based synthesis of fault-tolerant systems},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant Systems, 6th International Symposium (FTRTFT 2000) Proceedings},
  OPTcrossref = {},
  OPTkey = {},
  pages = {46--57},
  year = {2000},
  editor = {Mathai Joseph},
  OPTvolume = {},
  number = {1926},
  series = ser-LNCS,
  address = {Pune, India},
  month = sep,
  OPTorganization = {},
  publisher = pub-SV,
  OPTnote = {},
  annote = {Uses a precise formal semantics of statecharts to compositionally develop and verify systems.
Presents a fault-tolerant production cell as a case study.} } @TechReport{Larrea:2000:ECF, author = {Mikel Larrea and Antonio {Fern\'andez} and Sergio {Ar\'valo}}, title = {Eventually consistent failure detectors}, institution = {Universidad {P\'ublica} de Navarra, Spain}, year = {2000}, OPTkey = {}, OPTtype = {}, OPTnumber = {}, OPTaddress = {}, month = apr, note = {Presented as a brief announcement at DISC2000}, url = "\url{http://www.gsd.unavarra.es/pres/miembros/mikel/consistent.ps}", annote = {A new class of failure detectors is presented called `eventually consistent'. The weak accuracy property is enriched by a function with which processes can identify the `sommon one' process which is not wrongly suspected. This can be seen as a type of leader election capability. Eventually consistent failure detectors lie between eventually perfect and eventually strong ones. The additional information offered by this failure detector allows more efficient consensus algorithms. Since everybody eventually focusses on one and the same process as a coordinator, consensus algorthms are possible which do not rely on the rotating coordinator paradigm. This is part of Mikel's PhD research (see also \cite{Larrea:2000:OIW}).} } @InProceedings{Larrea:2000:OIW, author = {Mikel Larrea and Antonio Fern\'andez and Sergio Ar\'evalo}, title = {Optimal Implementation of the Weakest Failure Detector for Solving Consensus}, booktitle = pro-srds2000, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {2000}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {N\"urnberg, Germany}, month = oct, OPTorganization = {}, publisher = pub-IEEE, OPTnote = {}, OPTannote = {} } @InProceedings{Mantel:2000:CSM, author = {Heiko Mantel and Felix C. 
{G\"artner}}, title = {A case study in the mechanical verification of fault tolerance}, booktitle = {Proceedings of the 13th International Florida Artificial Intelligence Conference (FLAIRS-2000)}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {2000}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Orlando, FL}, month = may, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, annote = {Preliminary version available as TR \cite{Mantel:1999:CSM}.} } @Article{Mantel:2000:ACS, author = {Heiko Mantel and Felix C. {G\"artner}}, title = {A case study in the mechanical verification of fault tolerance}, journal = {Journal of Experimental \& Theoretical Artificial Intelligence (JETAI)}, year = {2000}, OPTkey = {}, volume = {12}, number = {4}, pages = {473--488}, month = oct, OPTnote = {}, OPTannote = {} } @PhdThesis{Masum:2000:NCB, author = {Asif Masum}, title = {Non-cooperative {Byzantine} failures: {A} new framework for the design of efficient fault tolerance protocols}, school = {Universit\"at-Gesamthochschule Essen, Fachbereich Mathematik und Informatik}, year = {2000}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, OPTmonth = {}, note = {Published by Libri Books on demand, ISBN 3-8311-0815-3.}, annote = {Conference version e.g. \cite{Echtle:1999:UCB}. Good overview over failure classification schemes.} } @article{Matsui:2000:FTS, volume ={E83-D}, number ={10}, pages ={1831--1840}, year ={2000}, month =oct, journal ={IEICE Transactions}, publisher ={Institute of Electronics, Information and Communication Engineers}, title ={Fault-Tolerant and Self-Stabilizing Protocols Using an Unreliable Failure Detector}, author ={H. Matsui and M. Inoue and T. Masuzawa and H. Fujiwara}, abstract ={We investigate possibility of fault-tolerant and self-stabilizing protocols (ftss protocols) using an unreliable failure detector. 
Our main contribution is (1) to newly introduce k-accuracy of an unreliable failure detector, (2) to show that k-accuracy of a failure detector is necessary for any ftss k-group consensus protocol, and (3) to present three ftss k-group consensus protocols using a k-accurate and weakly complete failure detector under the read/write daemon on complete networks and on (n-k+1)-connected networks, and under the central daemon on complete networks.}, keywords ={distributed algorithms; self-stabilization; fault-tolerance; failure detector; x-group consensus}, annote = "The term $k$-accuracy means that at least $k$ correct processes will not be wrongly suspected by the failure detector. $k=1$ is the same as weak accuracy while $k=n-t$ is the same as strong accuracy. (See also the eventual consistency definition of \cite{Larrea:2000:ECF}.) In a $k$-group consensus protocol all correct processes must eventually choose the same group of $k$ processes. This looks something like self-stabilizing $k$ leader election." } @InProceedings{Mittal:2000:DDP, author = "Neeraj Mittal and Vijay K. Garg", title = "Debugging Distributed Programs Using Controlled Re-execution", pages = "239--248", booktitle = "Proceedings of the 19th Annual {ACM} Symposium on Principles of Distributed Computing ({PODC}-00)", month = jul # " ~16--19", publisher = "ACM Press", address = "NY", year = "2000", annote = "Controlled re-execution means to execute a distributed program so that a given safety property is maintained during that execution. The authors identify a class of predicated for which this can be done efficiently, i.e. without much synchronization. There are some resemblances here to Schneider's enforceable security policies cite{Schneider:2000:ESP}." 
} @InProceedings{Mostefaoui:2000:KSA, author = "Achour {Most\'efaoui} and Michel Raynal", title = "{\it{k}}-Set Agreement with Limited Accuracy Failure Detectors", pages = "143--152", booktitle = "Proceedings of the 19th Annual {ACM} Symposium on Principles of Distributed Computing ({PODC}-00)", month = jul # " ~16--19", publisher = "ACM Press", address = "NY", year = "2000", annote = "Looks at the $k$-set agreement of \cite{Chaudhuri:1990:AHC} and shows the possibility and impossibility of solving it under different assumptions which include a failure detector with limited scope. Informally, the scope of the accuracy property is the number of processes that may not suspect a correct process." } @InProceedings{Namjoshi:2000:CCR, author = {Kedar S. Namjoshi and Richard J. Trefler}, title = {On the completeness of compositional reasoning}, booktitle = {Proceedings of the 12th Int. Conference on Computer Aided Verification (CAV2000)}, OPTcrossref = {}, OPTkey = {}, pages = {139--153}, year = {2000}, OPTeditor = {}, OPTvolume = {}, number = {1855}, series = ser-LNCS, OPTaddress = {}, month = jul, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Gives examples of non-circular compositional reasoning, unlike \cite{Abadi:1993:CS} which is also shown to be incomplete.} } @Article{Oberg:2000:NBP, author = {James Oberg}, title = {{NASA's} big push for the space station}, journal = {IEEE Spectrum}, year = {2000}, OPTkey = {}, volume = {37}, number = {11}, pages = {49--54}, month = nov, OPTnote = {}, annote = {Describes problems and workarounds while deploying the new space station ISS. States that the software on the ISS is far from well tested because of the tight schedule. 
Cite: ``We launched the Space Shuttle when we were 90 percent ready, but we're launching Space Station at only 50 percent.'' An example of planning flaws is the construction of Plasma Contact Units (PCU): because the ISS runs 130-180 V power (instead of 24-28 V in earlier designs) and orbits in thin plasma, a voltage threshold for arcing (which is at about 40 to 60 Vdc is surpassed by the outer skin of the spacecraft which endangers solar cells and outboard equipment and causes hazards for astronauts on space walks. Two PCUs were added to the design which are ion beams constantly shooting ions into space to decrease the electric potential. If one PCU breaks down, the other can still relieve the potential, but to fix a broken PCU a spacewalk is required! (The procedures now are to shut down part of the ISS in this situation and only run 24-28 V during repair.) Astronauts use IBM 760 Thinkpad laptop computers on board! Shows that it is good to still rely on heavy duty experienced technology like Mir and Sojus.} } @InProceedings{Pagnia:2000:SFE, author = {Henning Pagnia and Holger Vogt and Felix C. G\"artner and Uwe G. Wilhelm}, title = "Solving Fair Exchange with Mobile Agents", booktitle = asama2000, pages = "57--72", year = 2000, address = "Zurich, Switzerland", month = sep, volume = "1882", series = ser-LNCS, publisher = pub-SV, keyword = "mobile agent, e-commerce, security", abstract = { Mobile agents have been advocated to support electronic commerce over the Internet. While being a promising paradigm, many intricate problems need to be solved to make this vision reality. The problem of \emph{fair exchange} between two agents is one such fundamental problem. Informally speaking, this means to exchange two electronic items in such a way that neither agent suffers a disadvantage. We study the problem of fair exchange in the mobile agent paradigm. 
We show that while existing protocols for fair exchange can be substantially simplified in the context of mobile agents, there are still many problems related to security which remain difficult to solve. We propose three increasingly flexible solutions to the fair exchange problem and show how to implement them using existing agent technology. The basis for ensuring the security properties of fair exchange is a tamper-proof hardware device called a trusted processing environment. }, } @Article{Perry:2000:DAR, author = {Tekla S. Perry}, title = {faults \& failures: Does anybody really know what time it is?}, journal = {IEEE Spectrum}, year = {2000}, OPTkey = {}, volume = {37}, number = {10}, pages = {26--28}, month = oct, OPTnote = {}, annote = {Another amusing story in this regular column: studies the reasons behind the problem of VCRs not adjusting to the right time. For much of 1999, video cassette recorders (VCRs) around the U.S. were showing the wrong time. It affected only machines which had an automatic time adjuster builtin. This adapter reads the time which is broadcasted as part of the public broadcasting service (PBS, in German it's Videotext) and adjusts the VCRs clock to it. The reason for this fault was an incorrect time broadcasted by some PBS stations and it took months to locate it. 
This was due to hardly any user response (an article of a journalist triggered a wide response after months) and due to PBS providers not knowing how to set the broadcasted time correctly.} } @InProceedings{Pleisch:2000:MFT, author = {Stefan Pleisch and {Andr\'e} Schiper}, title = {Modeling fault-tolerant mobile agent execution as a sequence of agreement problems}, booktitle = pro-srds2000, OPTcrossref = {}, OPTkey = {}, pages = {11--20}, year = {2000}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {N\"urnberg, Germany}, month = oct, OPTorganization = {}, publisher = pub-IEEE, OPTnote = {}, OPTannote = {} } @TechReport{Prasetya:2000:FFT, author = {I. S. W. B. Prasetya and S. D. Swierstra}, title = {Factorizing Fault Tolerance}, institution = {University of Utrecht, Department of Computer Science}, year = {2000}, OPTkey = {}, OPTtype = {}, number = {UU-CS-2000-02}, address = {Utrecht, The Netherlands}, OPTmonth = {}, note = {Appears in special issue of TCS on fault tolerance}, annote = {This is the paper which first introduced me to the issue of composition of liveness properties. The paper proposes a composition law which is based on the notion of `temporal non-interference'. This means the following: Given a component $P$ which satisfies $p\leadsto q$ and a component $Q$ which does not interfere with $P$'s progress as long as some flag $a$ is high, then the parallel composition of $P$ and $Q$ satisfies $p\leadsto q$ if $P$ raises $a$ long enough. The point is that unlike the usual composition of e.g. self-stabilizing algorithms (like in \cite{Herman:1991:ATD,Dolev:1993:SDS} and also \cite{Gouda:1991:SCP}) the component $Q$ may interfere with $P$ at some times (but only after $P$ has reached progress). The composition law is applied in an example where fault-tolerance is achieved through exception handling. 
The framework is built on top of UNITY \cite{Chandy:1988:PPD} and checked using HOL.} } @Article{Randell:2000:TML, author = "Brian Randell", title = "{Turing Memorial Lecture}: Facing Up to Faults", journal = j-COMP-J, volume = "43", number = "2", pages = "95--106", year = "2000", url = "http://www3.oup.co.uk/computer_journal/hdb/Volume_43/Issue_02/430095.sgm.abs.html; http://www3.oup.co.uk/computer_journal/hdb/Volume_43/Issue_02/430095.pdf", annote = "A wise and cunning look back at the central problems in fault tolerance from the viewpoint of one of the big men. Mentions Babbage's concern about correct mathematical navigation tables (see also \cite{Bowen:1993:SCS}) and his first ideas of n version programming. Looks on the necessity of fault-tolerant computing (``the more dependable computing systems become, the more dependence is placed on them''). Recalls concepts from \cite{Laprie:1992:DBC} and explicitly notes that the quality of fault-tolerance depends heavily on the quality of the fault assumption (p.100). Quote: ``Yet all too often, inadequate attention i paid to identifying and justifying a set of fault assumptions''. Notes the problems with feature interaction and non-interference when it comes to compositionality. Quote: ``All fault tolerance involves the use of redundancy---of representation and/or activity---whose consistency can be checked.'' Notes that notions of diversity are not very well understood and that ad hoc standards in operating systems are a problem when it comes to fault tolerance through system diversity." 
} @Article{Schoder:2000:TOR, author = "Detlef Schoder and Torsten Eymann", title = "Technical opinion: The real challenges of mobile agents", journal = j-CACM, volume = "43", number = "6", pages = "111--112", month = jun, year = "2000", coden = "CACMA2", ISSN = "0001-0782", bibdate = "Mon Sep 25 15:22:32 MDT 2000", url = "http://www.acm.org/pubs/citations/journals/cacm/2000-43-6/p111-schoder/", acknowledgement = ack-nhfb, subject = "Computer Systems Organization --- Computer-Communication Networks --- General (C.2.0); Computer Systems Organization --- Computer-Communication Networks --- Distributed Systems (C.2.4); Computing Methodologies --- Artificial Intelligence --- Distributed Artificial Intelligence (I.2.11)", annote = "states that mobile agents should have a kind of self-stabilizing social behavior. Contrasts nicely to \cite{Lange:1999:SGR}." } @Article{Schulzki:2000:KI, author = {Christiane Schulzki-Haddouti}, title = {{Kritische Infrastrukturen}}, journal = {FIfF Kommunikation}, year = {2000}, OPTkey = {}, OPTvolume = {}, OPTnumber = {3}, pages = {19--20}, month = sep, OPTnote = {}, annote = {Teil des Sonderheftes \cite{Bernhardt:2000:RDR}.} } @Article{Schumacher:2000:AI, author = {M. Schumacher and M.L. Moschgath and U. Roedig}, title = {{Angewandte Informationssicherheit} --- {Ein Hacker-Praktikum an Universit\"aten}}, journal = {Informatik Spektrum}, year = {2000}, OPTkey = {}, volume = {23}, number = {3}, pages = {202--211}, month = jun, OPTnote = {}, annote = {Presents an interesting course taught at TU Darmstadt: Students had to attack and defend a network of PCs to learn the practices of ``real'' network security.} } @InProceedings{Stoller:2000:EDG, author = {Scott D. Stoller and Leena Unnikrishnan and Yanhong A. 
Liu}, title = {Efficient detection of global properties in distributed systems using partial-order methods}, booktitle = {Computer Aided Verification (CAV 2000)}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {2000}, OPTeditor = {}, volume = {1855}, OPTnumber = {}, series = ser-LNCS, OPTaddress = {}, OPTmonth = {}, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, annote = {Uses the ``persistent-set technique'' (a method known from partial order research to optimize state space search) to detect possibly and definitely in distributed computations. The algorithm is compared to two special case algorithms by Garg and Waldecker and it is shown to (a) handle a larger class of predicates, and (b) have the same worst case aymptotic time complexity. Results are backed by simulation data.} } @Book{Tel:2000:IDA, author = {Gerard Tel}, ALTeditor = {}, title = {Introduction to Distributed Algorithms}, publisher = {Cambridge University Press}, year = {2000}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, edition = {Second}, OPTmonth = {}, OPTnote = {}, OPTannote = {} } @InProceedings{Verissimo:2000:TCB, author = "Paulo Ver\'{\i}ssimo and Antonio Casimiro and Christof Fetzer", title = "The Timely Computing Base: Timely Actions in the Presence of Uncertain Timeliness", booktitle = "Proceedings of the International Conference on Dependable Systems and Networks", year = "2000", abstractURL = "http://www.di.fc.ul.pt/~casim/papers/dsn00/dsn00.html", documentURL = "http://www.di.fc.ul.pt/~casim/papers/dsn00/dsn00.ps.gz", pages = "533--542", publisher = "IEEE Computer Society Press", address = "New York City, USA", month = jun, annote = "[to read]" } @PhdThesis{Voelzer:2000:FRK, author = {Hagen {V\"olzer}}, title = {{Fairness, Randomisierung und Konspiration in verteilten Algorithmen}}, school = {Humboldt Universit\"at zu Berlin, Fakult\"at f\"ur Informatik}, year = {2000}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, month = dec, OPTnote = 
{}, OPTannote = {}, url = "\url{http://dochost.rz.hu-berlin.de/abstract.php3/dissertationen/voelzer-hagen-2000-12-08}", } @Article{Wang:2000:PDA, author = "Wenli Wang and Zolt{\'a}n Hidv{\'e}gi and Andrew D. {Bailey, Jr.} and Andrew B. Whinston", title = "{E}-Process Design and Assurance Using Model Checking", journal = "Computer", volume = "33", number = "10", pages = "48--53", month = oct, year = "2000", url = "http://www.computer.org/computer/co2000/rx048abs.htm; http://dlib.computer.org/co/books/co2000/pdf/rx048.pdf", abstract = "Using a simple online ticket sales example and the authors demonstrate that model checking can help businesses verify their e-processes.", annote = "Shows that with model checking you can do model checking. Nothing particular to e-commerce or security (unfortunately)." } @Book{Bergstra:2001:HPA, editor = {Jan A. Bergstra and Alban Ponse and Scott A. Smolka}, title = {Handbook of Process Algebra}, publisher = {North-Holland}, year = {2001}, OPTkey = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTedition = {}, OPTmonth = {}, OPTnote = {}, OPTannote = {schon da?} } @Article{Furse:2001:DTW, author = {Cynthia Furse and Randy Haupt}, title = {Down to the Wire}, journal = {IEEE Spectrum}, year = {2001}, OPTkey = {}, volume = {38}, number = {2}, pages = {34--39}, month = feb, OPTnote = {}, annote = {Drastic feature about the risks of aging wiring in aircraft. Airplanes stay in use for more than 20 years and so many parts are in danger of failing because of age. Especially wires are critical because the aircraft is full of them and they cannot be easily replaced. Studies show that in 20+ years old aircraft there is between 1.6 and 13 cracks per 1000 meter wires (there are about 240 km of wire in a Lockheed L-1011). Similar things count for military jets which stay in operation much longer (B-52s for example for 80 years). Faults can lead to sparks, fire, information loss, transient communication loss. 
Diagnosis tools are already good, but what is needed is prognosis.} } @PhdThesis{Gaertner:2001:FGF, author = {Felix C. {G\"artner}}, title = {Formale Grundlagen der Fehlertoleranz in verteilten Systemen}, school = {Fachbereich Informatik, TU Darmstadt}, year = {2001}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, month = may, note = {}, OPTannote = {}, url = "\url{http://elib.tu-darmstadt.de/diss/000162/}", } @InProceedings{Gaertner:2001:DRF, author = {Felix C. G\"artner and Hagen V\"olzer}, title = {Defining Redundancy in Fault-Tolerant Computing}, booktitle = {Brief Announcement at the 15th International Symposium on DIStributed Computing (DISC 2001)}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {2001}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Lisbon, Portugal}, month = oct, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, OPTannote = {} } @TechReport{Gaertner:2001:GIF, author = {Felix C. G\"artner}, title = {A gentle introduction to failure detectors and related problems}, institution = {Darmstadt University of Technology, Department of Computer Science}, year = {2001}, OPTkey = {}, OPTtype = {}, number = {TUD-BS-2001-01}, OPTaddress = {}, month = apr, OPTnote = {}, url = "\url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2001-01.ps.gz}", annote = {A more informal introduction to defining and using unreliable failure detectors \cite{Chandra:1996:UFD} in the design and analysis of fault tolerant distributed algorithms.} } @InProceedings{Gaertner:2001:IPD, author = {Felix C. 
G\"artner and Stefan Pleisch}, title = {{(Im)Possibilities} of predicate detection in crash-affected systems}, booktitle = {Proceedings of the 5th Workshop on Self-Stabilizing Systems (WSS2001)}, OPTcrossref = {}, OPTkey = {}, OPTpages = {}, year = {2001}, OPTeditor = {}, OPTvolume = {}, number = {2194}, pages ={98--113}, series = ser-LNCS, address = {Lisbon, Portugal}, month = oct, OPTorganization = {}, publisher = pub-SV, note = {}, OPTannote = {} } @TechReport{Gaertner:2001:IPDIBM, author = {Felix C. {G\"artner} and Stefan Pleisch}, title = {{(Im)Possibilities} of Predicate Detection in Crash-Affected Systems}, institution = {IBM Research Laboratory, Zurich}, year = {2001}, OPTkey = {}, type = {Research Report}, number = {RZ 3361 (\# 93407)}, address = {}, month = aug, url = "\url{http://domino.watson.ibm.com/library/CyberDig.nsf/Search}", OPTnote = {}, OPTannote = {} } @Misc{LeLann:2001:ART, OPTkey = {}, author = {Gerard LeLann}, title = {Is asynchronous real-time an oxymoron?}, howpublished = {Invited presentation at the 15th International Symposium on DIStributed Computing (DISC 2001)}, month = oct, year = {2001}, note = {Lisbon, Portugal}, OPTannote = {related reference is \cite{LeLann:1995:ORN}. Is this published anywhere?} } @InProceedings{Aguilera:2002:OIF, author = {Marcos K. Aguilera and {G\'erard} Le Lann and Sam Toueg}, title = {On the impact of fast failure detectors on real-time fault-tolerant systems}, booktitle = {Proceedings of the 16th International Symposium on DIStributed Computing (DISC 2002)}, crossref = {Mahlki:2002:DC}, OPTkey = {}, pages = {354--369}, year = {2002}, editor = {Dahlia Malkhi}, OPTvolume = {}, number = {2508}, series = ser-LNCS, address = {Toulouse, France}, month = oct, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, OPTannote = {} } @InProceedings{Gaertner:2002:FDS, author = {Felix C. 
{G\"artner} and Stefan Pleisch}, title = {Failure detection sequencers: {Necessary} and sufficient information about failures to solve predicate detection}, booktitle = {Proceedings of the 16th International Symposium on DIStributed Computing (DISC 2002)}, crossref = {Mahlki:2002:DC}, OPTkey = {}, pages = {280--294}, year = {2002}, editor = {Dahlia Malkhi}, OPTvolume = {}, number = {2508}, series = ser-LNCS, address = {Toulouse, France}, month = oct, OPTorganization = {}, publisher = pub-SV, OPTnote = {}, OPTannote = {} } @TechReport{Gaertner:2002:FDSIBM, author = {Felix C. {G\"artner} and Stefan Pleisch}, title = {Failure detection sequencers: {Necessary} and sufficient information about failures to solve predicate detection}, institution = {IBM Research Laboratory, Zurich}, year = {2002}, OPTkey = {}, type = {Research Report}, number = {RZ 3438}, address = {}, OPTmonth = aug, url = "\url{http://domino.watson.ibm.com/library/CyberDig.nsf/Search}", OPTnote = {}, OPTannote = {} } @TechReport{Gaertner:2002:RLPCSS, author = {Felix C. 
{G\"artner}}, title = {Revisiting Liveness Properties in the Context of Secure Systems}, institution = {Swiss Federal Institute of Technology (EPFL), School of Computer and Communication Sciences}, year = {2002}, OPTkey = {}, OPTtype = {}, number = {200278}, address = {Lausanne, Switzerland}, month = nov, OPTnote = {}, OPTannote = {} } @Article{Guerraoui:2002:NBA, author = {Rachid Guerraoui}, title = {Non-Blocking Atomic Commitment in Asynchronous Systems with Failure Detectors}, journal = j-DC, year = {2002}, OPTkey = {}, volume = {15}, number = {1}, OPTpages = {}, OPTmonth = {}, OPTnote = {}, OPTannote = {} } @InProceedings{Guerraoui:2002:WFD, author = {Rachid Guerraoui and Petr Kouznetsov}, title = {On the weakest failure detector for non-blocking atomic commit}, OPTcrossref = {}, OPTkey = {}, booktitle = {Proceedings of the International Conference on Theoretical Computer Science (TCS 2002), 17th IFIP World Computer Congress}, OPTpages = {}, year = 2002, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, address = {Monteal, Canada}, month = aug, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, OPTannote = {} } @Article{Hermant:2002:FAU, author = {{Jean-Fran\c{c}ois} Hermant and {G\'erard} Le Lann}, title = {Fast asynchronous uniform consensus in real-time distributed systems}, journal = j-IEEE-TRANS-COMP, year = {2002}, OPTkey = {}, volume = {51}, number = {8}, pages = {931--944}, month = aug, OPTnote = {}, annote = {A very relavant paper regarding the practicality of the failure detector approach. The basic idea of the paper is to use the principle of `late binding' (known from programming languages) to build real-time distributed protocols from asynchronous solutions for the `time-free' version of the problem. The approach is as follows: for a real-time problem, (1) turn the specification into a time-free problem (e.g. 
by basing timeliness requirements on certain activation conditions using time-free extensions to the asynchronous model - like failure detectors), then devise an asynchronous solution, (2) design a solution to the time-free extension in an as weak partially synchronous model as possible, (3) if the original problem is a real-time problem or in case one needs to predict real-time behavior, bind the parameters of the time-free extension to some possibly stronger partially synchronous model and establish time bounds for the extension, from that establish time bounds for the overall algorithm. Why is late binding good? First of all, devising solutions in this way results in systems that satisfy safety and liveness with the highest amount of coverage possible under the fault assumption (the coverage of the asynchronous model - because it makes no assumption - is higher than any (partially) synchronous model).Second, early binding of a solution makes you have to reason about timing and scheduling even if the original problem is not a real-time computing problem. The paper shows how late binding can be done using uniform consensus based on a strong failure detector (using which algorithm?), implementing the failure detector in a real-time Ethernet, and from that deriving a fast uniform consensus algorithm. This approach also has the advantage that failure detection has expedited delivery and so the failure detection time can be magnitutes smaller than regular message delivery (see also \cite{Aguilera:2002:OIF}). The timed-asynchronous (TA) system model \cite{Cristian:1999:TAD} and the timely computing base (TCB) \cite{Verissimo:2000:TCB} all do early binding. These models try then to enforce timing assumptions by what here is called ``measure-compare-and-kill'' (similar to the ``process controlled crash'' explained in \cite[p.14]{Defago:2000:ARP} used in ISIS and other systems). 
This means that a continuing timing failure detection takes place and that late services are treated as omissions, and it assumes that every timing failure is detected to maintain the confidence in the correctness. However, this means to perform scheduling and real-time analyses almost everywhere in the system, which can be tough. If these bounds are violated, the system might even lose liveness. The paper concludes (rather strongly) that ``TA and TCB lead to inefficient working solutions.'' Overall, this paper is both conceptual and technical (with a lot of real-time stuff) and argues strongly for its points. Some material presented at DISC 2001 \cite{LeLann:2001:ART}.} } @InProceedings{Jhumka:2002:SDC, author = {Arshad Jhumka and Martin Hiller and Vilgot Claesson and Neeraj Suri}, title = {On systematic design of consistent executable assertions for distributed embedded software}, OPTcrossref = {}, OPTkey = {}, booktitle = {Proceedings of the ACM Joint Conference on Languages, Compilers and Tools for Embedded Systems/Software and Compilers for Embedded Systems (LCTES/SCOPES)}, pages = {74--83}, year = {2002}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, OPTmonth = {}, OPTorganization = {}, OPTpublisher = {}, OPTnote = {}, OPTannote = {} } @TechReport{Jhumka:2002:OSD, author = {Arshad Jhumka and Felix C. {Gärtner} and Christof Fetzer and Neeraj Suri}, title = {On Systematic Design of Fast and Perfect Detectors}, institution = {Swiss Federal Institute of Technology (EPFL), School of Computer and Communication Sciences}, year = {2002}, OPTkey = {}, OPTtype = {}, number = {200263}, address = {Lausanne, Switzerland}, month = sep, OPTnote = {}, OPTannote = {} } @InProceedings{Kulkarni:2002:CAF, author = {Sandeep S. Kulkarni and A. 
Ebnenasir}, title = {Complexity of adding failsafe fault-tolerance}, OPTcrossref = {}, OPTkey = {}, booktitle = {Proceedings of the 22nd IEEE International Conference on Distributed Computing Systems (ICDCS 2002)}, pages = {337--344}, year = {2002}, OPTeditor = {}, OPTvolume = {}, OPTnumber = {}, OPTseries = {}, OPTaddress = {}, month = jul, OPTorganization = {}, publisher = pub-IEEE, OPTnote = {}, OPTannote = {} } @Book{Malkhi:2002:DC, editor = {Dahlia Malkhi}, title = {Distributed Computing. 16th International Conference (DISC 2002)}, publisher = pub-SV, year = {2002}, OPTkey = {}, OPTvolume = {}, number = {2508}, series = ser-LNCS, address = {Toulouse, France}, OPTedition = {}, month = oct, OPTnote = {}, OPTannote = {} } @PhdThesis{Muehl:2002:FGF, author = {Gero {M\"uhl}}, title = {Large-Scale Content-Based Publish-Subscribe Systems}, school = {Fachbereich Informatik, TU Darmstadt}, year = {2002}, OPTkey = {}, OPTtype = {}, OPTaddress = {}, month = nov, note = {}, OPTannote = {}, url = "\url{http://elib.tu-darmstadt.de/diss/000274/}", } @Article{Pagnia:2003:FE, author = {Henning Pagnia and Holger Vogt and Felix C. {G\"artner}}, title = {Fair Exchange}, journal = j-COMP-J, year = {2003}, OPTkey = {}, volume = {46}, number = {1}, OPTpages = {}, OPTmonth = {}, OPTnote = {}, OPTannote = {} } @Misc{Hadzilacos:FFT, OPTcrossref = "", OPTkey = "", author = "Vassos Hadzilacos and Prasad Jayanti and Sam Toueg", title = "Fundamentals of Fault-Tolerant Distributed Computing", howpublished = "Forthcoming", OPTyear = "", OPTmonth = "", OPTnote = "", annote = "Referenced in \cite{Hadzilacos:1994:MAF} but obviously has not been published yet." } @Misc{MAFTIA, key = {MAFTIA}, OPTauthor = {}, title = {MAFTIA Home -- {Malicious- and Accidental-Fault Tolerance for Internet Applications}}, howpublished = {Internet: \url{http://www.newcastle.research.ec.org/maftia/}}, OPTmonth = {}, OPTyear = {}, OPTnote = {}, OPTannote = {} }