%% $Header: /users/fgaertner/cvsroot/felix/tex/bibliographies/felix-stabilization.bib,v 1.3 2002/11/26 19:19:20 fgaertner Exp $
%%% Edited by Felix Gaertner <felix at informatik.tu-darmstadt.de>
%%%
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Felix C. Gaertner",
%%% version = "see RCS Header",
%%% date = "see RCS Header",
%%% time = "see RCS Header",
%%% filename = "felix-stabilization.bib",
%%% address = "EPFL, I&C, LPD, Switzerland",
%%% telephone = "+41-21 693 7501",
%%% FAX = "+41 21 693 7570",
%%% URL = "http://lpdwww.epfl.ch/fgaertner/",
%%% checksum = "XXX",
%%% email = "fgaertner at lpdmail.epfl.ch,
%%% fcg at acm.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "bibliography, stabilization, fault-tolerance",
%%% supported = "no",
%%% docstring = "This BibTeX file records books and articles
%%% about fault-tolerance, including topics
%%% like stabilization, self-stabilization and
%%% whatever seems important to me. The annote
%%% field contains short content descriptions
%%% for my own personal use which might be
%%% interesting for others too. The ISBN
%%% fields will be printed if the is-alpha.bst
%%% or is-plain.bst style files are used.
%%%
%%% BibTeX citation tags are uniformly chosen
%%% as name:year:abbrev, where name is the
%%% family name of the first author or editor,
%%% year is a 4-digit number, and abbrev is a
%%% 3-letter condensation of important title
%%% words. Citation tags were automatically
%%% generated by the biblabel software
%%% developed for the BibNet Project.
%%%
%%% In this bibliography, entries are sorted
%%% first by ascending year, and within each
%%% year, alphabetically by author or editor,
%%% and then, if necessary, by the 3-letter
%%% abbreviation at the end of the BibTeX
%%% citation tag, using the bibsort -byyear
%%% utility. Year order has been chosen to
%%% make it easier to identify the most recent
%%% work.
%%%
%%% The bibsort utility, and several related
%%% programs for bibliography maintenance, is
%%% available on ftp.math.utah.edu in
%%% /pub/tex/bib, and at other Internet sites
%%% which mirror it, including the
%%% Comprehensive TeX Archive Network (CTAN);
%%% the command `finger ctan<at>pip.shsu.edu'
%%% will produce a list of CTAN hosts.
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility."
%%% }
%%% ====================================================================
%%%
%%% Thanks go to:
%%% Nelson Beebe
%%%
%=======================================================================
% Acknowledgement abbreviations:
% ack-nhfb is referenced (unquoted) by entries taken over from Nelson
% Beebe's bibliographies, e.g. Dijkstra:1983:DTD; it must be defined
% before its first use or BibTeX reports an undefined macro.
@String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department
                    of Mathematics, Salt Lake City, UT 84112, USA"}
%=======================================================================
% Institutional abbreviations:
@string{inst-STAN-CS = {Stanford University, Department of Computer Science}}
%=======================================================================
% Journal abbreviations:
@string{j-ACM                     = {Journal of the ACM}}
@string{j-ACM-ADALET              = {ACM Ada Letters}}
@string{j-ACM-COMPREV             = {ACM Computing Reviews}}
@string{j-ACM-COMP-SURVEYS        = {ACM Computing Surveys}}
@string{j-APL-QUOTE-QUAD          = {APL Quote Quad}}
@string{j-CACM                    = {Communications of the ACM}}
@string{j-CCCUJ                   = {C/C++ Users Journal}}
@string{j-COMP-J                  = {The Computer Journal}}
@string{j-COMP-LANG-MAG           = {Computer Language Magazine}}
@string{j-COMPUT-STAT-Q           = {Computational Statistics Quarterly}}
@string{j-COMPUTER                = {Computer}}
@string{j-DC                      = {Distributed Computing}}
@string{j-DDJ                     = {Dr. Dobb's Journal of Software Tools}}
@string{j-IEEE-ASSP-MAG           = {IEEE ASSP magazine: a publication of the
                                     IEEE Acoustics, Speech, and Signal
                                     Processing Society}}
@string{j-IEEE-SOFTWARE           = {IEEE Software}}
@string{j-IEEE-COMPUTER           = {IEEE Computer}}
@string{j-IEEE-TRANS-SOFTW-ENG    = {IEEE Transactions on Software Engineering}}
@string{j-IEEE-TRANS-COMP         = {IEEE Transactions on Computers}}
@string{j-IFIP-TRANS-A            = {IFIP Transactions. A. Computer Science and
                                     Technology}}
@string{j-INFO-PROC-SOC-JAPAN     = {Journal of the Information Processing
                                     Society of Japan = Joho Shori}}
@string{j-IPL                     = {Information Processing Letters}}
@string{j-INFORMATIE              = {Informatie}}
@string{j-IS                      = {Informatik Spektrum}}
@string{j-J-COMP-SCI-TECH         = {Journal of Computer Science and Technology}}
@string{j-J-OOP                   = {Journal of Object Oriented Programming}}
@string{j-LINUX-JOURNAL           = {Linux Journal}}
@string{j-RS-MAGAZINE             = {RS\slash Magazine}}
@string{j-SEJ                     = {Software Engineering Journal}}
@string{j-SIGCSE                  = {SIGCSE Bulletin (ACM Special Interest Group
                                     on Computer Science Education)}}
@string{j-SIGPLAN                 = {ACM SIGPLAN Notices}}
@string{j-SOFTWARE-CONCEPTS-TOOLS = {Software --- Concepts and Tools}}
@string{j-SPE                     = {Soft{\-}ware\emdash Prac{\-}tice
                                     and Experience}}
@string{j-STRUCT-PROGRAM          = {Structured Programming}}
@string{j-SUNEXPERT               = {SunExpert}}
@string{j-TEXHAX                  = {{\TeX{}{\-}hax}}}
@string{j-TEXNIQUES               = {{\TeX{}}{\-}niques, Publications for
                                     the {\TeX{}} community}}
@string{j-TOPLAS                  = {ACM Transactions on Programming
                                     Languages and Systems}}
@string{j-TOCS                    = {ACM Transactions on Computer Systems}}
@string{j-TUGBOAT                 = {{\TUB{}}}}
%=======================================================================
% Proceedings abbreviations:
@string{pro-ftcs85         = {Proceedings of the 15th IEEE Symposium on
                              Fault Tolerant Computing Systems (FTCS-15)}}
@string{pro-ftcs93         = {Proceedings of the 23rd IEEE Symposium on
                              Fault Tolerant Computing Systems (FTCS-23)}}
@string{pro-ftcs96         = {Proceedings of the 26th IEEE Symposium on
                              Fault Tolerant Computing Systems (FTCS-26)}}
@string{pro-ftcs97         = {Proceedings of the 27th IEEE Symposium on
                              Fault Tolerant Computing Systems (FTCS-27)}}
@string{pro-ftcs98         = {Proceedings of the 28th IEEE Symposium on
                              Fault Tolerant Computing Systems (FTCS-28)}}
@string{pro-ftcs98-fastabs = {Digest of FastAbstracts of the 28th IEEE
                              Symposium on Fault Tolerant Computing Systems
                              (FTCS-28)}}
@string{pro-wdag89         = {Proceedings of the 3rd International Workshop
                              on Distributed Algorithms (WDAG89)}}
@string{pro-wdag90         = {Proceedings of the 4th International Workshop
                              on Distributed Algorithms (WDAG90)}}
@string{pro-wdag91         = {Proceedings of the 5th International Workshop
                              on Distributed Algorithms (WDAG91)}}
@string{pro-wdag92         = {Proceedings of the 6th International Workshop
                              on Distributed Algorithms (WDAG92)}}
@string{pro-wdag93         = {Proceedings of the 7th International Workshop
                              on Distributed Algorithms (WDAG93)}}
@string{pro-wdag94         = {Proceedings of the 8th International Workshop
                              on Distributed Algorithms (WDAG94)}}
@string{pro-wdag95         = {Proceedings of the 9th International Workshop
                              on Distributed Algorithms (WDAG95)}}
@string{pro-wdag96         = {Proceedings of the 10th International Workshop
                              on Distributed Algorithms (WDAG96)}}
@string{pro-wdag97         = {Proceedings of the 11th International Workshop
                              on Distributed Algorithms (WDAG97)}}
@string{pro-podc84 = "Proceedings of the 3rd
Annual ACM Symposium on Principles of Distributed
Computing (PODC'84)"}
@string{pro-podc90 = "Proceedings of the 9th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'90)"}
@string{pro-podc91 = "Proceedings of the 10th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'91)"}
@string{pro-podc92 = "Proceedings of the 11th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'92)"}
@string{pro-podc93 = "Proceedings of the 12th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'93)"}
@string{pro-podc94 = "Proceedings of the 13th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'94)"}
@string{pro-podc95 = "Proceedings of the 14th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'95)"}
@string{pro-podc96 = "Proceedings of the 15th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'96)"}
@string{pro-podc97 = "Proceedings of the 16th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'97)"}
@string{pro-podc98 = "Proceedings of the 17th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'98)"}
@string{pro-podc99 = "Proceedings of the 18th
Annual ACM Symposium on Principles of Distributed
Computing (PODC'99)"}
@string{pro-srds91   = {Proceedings of the 10th IEEE Symposium on
                        Reliable Distributed Systems (SRDS91)}}
@string{pro-srds92   = {Proceedings of the 11th IEEE Symposium on
                        Reliable Distributed Systems (SRDS92)}}
@string{pro-srds94   = {Proceedings of the 13th IEEE Symposium on
                        Reliable Distributed Systems (SRDS94)}}
@string{pro-srds95   = {Proceedings of the 14th IEEE Symposium on
                        Reliable Distributed Systems (SRDS95)}}
@string{pro-srds2000 = {Proceedings of the 19th IEEE Symposium on
                        Reliable Distributed Systems (SRDS2000)}}
@string{pro-wss95    = {Proceedings of the 2nd Workshop on
                        Self-Stabilizing Systems}}
@string{pro-wss97    = {Proceedings of the 3rd Workshop on
                        Self-Stabilizing Systems}}
@string{pro-wss99    = {Proceedings of the 19th IEEE International
                        Conference on Distributed Computing Systems
                        Workshop on Self-Stabilizing Systems}}
@string{pro-icdcs94  = {Proceedings of the 14th IEEE International
                        Conference on Distributed Computing Systems
                        (ICDCS94)}}
@string{pro-icdcs96  = {Proceedings of the 16th IEEE International
                        Conference on Distributed Computing Systems
                        (ICDCS96)}}
@string{pro-icdcs98  = {Proceedings of the 18th IEEE International
                        Conference on Distributed Computing Systems
                        (ICDCS98)}}
@string{pro-icdcs99  = {Proceedings of the 19th IEEE International
                        Conference on Distributed Computing Systems
                        (ICDCS99)}}
% asa and ma begin with a space on purpose: they are spliced into
% asama2000 directly after an ordinal word.
@string{asa          = " International Symposium on Agent Systems and Applications"}
@string{ma           = " International Symposium on Mobile Agents"}
@string{asama2000    = "Proceedings of the " # "Second" # asa
                       # " and Fourth" # ma # " (ASA/MA2000)"}
%=======================================================================
% Publishers and their addresses:
@string{pub-ACM             = {ACM Press, New York}}
@string{pub-ACM:adr         = {New York, NY 10036, USA}}
@string{pub-AW              = {Ad{\-d}i{\-s}on-Wes{\-l}ey, Reading, MA}}
@string{pub-AW:adr          = {Reading, MA, USA}}
@string{pub-BENCUM          = {Benjamin/Cummings Pub. Co.}}
@string{pub-BENCUM:adr      = {Redwood City, CA, USA}}
@string{pub-IEEE            = {IEEE Computer Society Press}}
@string{pub-IEEE-CSP        = {IEEE Computer Society Press}}
@string{pub-IEEE-CSP:adr    = {Los Alamitos, CA, USA}}
@string{pub-IEEE:adr        = {1109 Spring Street, Suite 300, Silver
                               Spring, MD 20910, USA}}
@string{pub-ITCP            = {International Thomson Computer Press}}
@string{pub-ITCP:adr        = {20 Park Plaza Suite 1001, Boston,
                               MA 02116 USA}}
@string{pub-ITP             = {International Thomson Publishing}}
@string{pub-ITP:adr         = {5101 Madison Road, Cincinnati, OH
                               45227, USA}}
@string{pub-MH              = {McGraw-Hill}}
@string{pub-MH:adr          = {New York, NY, USA}}
@string{pub-MIT             = {MIT Press}}
@string{pub-MIT:adr         = {Cambridge, MA, USA}}
@string{pub-PH              = {Pren{\-}tice-Hall}}
@string{pub-PH:adr          = {Englewood Cliffs, NJ, USA}}
@string{pub-SUCSLI          = {Stanford University Center for the
                               Study of Language and Information}}
@string{pub-SUCSLI:adr      = {Stanford, CA, USA}}
@string{pub-SV              = {Spring{\-}er-Ver{\-}lag}}
@string{pub-SV:adr          = {Berlin, Germany~/ Heidelberg,
                               Germany~/ London, UK~/ etc.}}
@string{pub-TEXPLORATOR     = {The {\TeX}plorators Corporation}}
@string{pub-TEXPLORATOR:adr = {3701 W. Alabama, Suite 450-273,
                               Houston, TX 77027, USA}}
@string{pub-USENIX          = {USENIX}}
@string{pub-USENIX:adr      = {Berkeley, CA, USA}}
@string{pub-VNR             = {Van Nostrand Reinhold}}
@string{pub-VNR:adr         = {New York, NY, USA}}
@string{pub-WORLD-SCI       = {World Scientific Publishing
                               Co. Pte. Ltd.}}
@string{pub-WORLD-SCI:adr   = {P. O. Box 128, Farrer Road,
                               Singapore 9128}}
%=======================================================================
% Series abbreviations:
@string{ser-LNCS = {Lecture Notes in Computer Science}}
%=======================================================================
% Bibliography entries.
@inproceedings{Floyd:1967:AMP,
  author    = {R. W. Floyd},
  editor    = {J. T. Schwartz},
  title     = {Assigning meaning to programs},
  booktitle = {Mathematical aspects of computer science: Proc.
               American Mathematics Soc. symposia},
  volume    = {19},
  pages     = {19--31},
  publisher = {American Mathematical Society},
  address   = {Providence RI},
  year      = {1967},
  annote    = {[to get] first idea of termination function to prove
               termination of algorithms.}
}
@Article{Dijkstra:1974:SSS,
  author  = "Edsger W. Dijkstra",
  title   = "Self-stabilizing systems in spite of distributed
             control",
  journal = j-CACM,
  volume  = 17,
  number  = 11,
  month   = nov,
  year    = 1974,
  pages   = "643--644",
  annote  = "Standard reference to the introduction of the notion
             of self-stabilization into computer science."
}
@article{Manna:1974:AAT,
  author  = {Zohar Manna and Amir Pnueli},
  title   = {Axiomatic approach to total correctness of programs},
  journal = {Acta Informatica},
  year    = {1974},
  volume  = {3},
  pages   = {243--263},
  annote  = {[to get] Call termination function ``convergence function''.}
}
@book{Niemann:1974:MDM,
  author    = "H. Niemann",
  title     = "{Methoden der Mustererkennung}",
  publisher = "Akademische Verlagsgesellschaft",
  address   = "Frankfurt",
  year      = "1974",
  annote    = "[Angabe von Armin]"
}
@article{Dijkstra:1975:GCN,
  author  = "Edsger W. Dijkstra",
  title   = {Guarded commands, nondeterminacy, and formal
             derivation of programs},
  journal = j-CACM,
  volume  = 18,
  number  = 8,
  pages   = {453--457},
  month   = aug,
  year    = 1975
}
@article{Katz:1975:CLT,
  author  = {Shmuel M. Katz and Zohar Manna},
  title   = {A closer look at termination},
  journal = {Acta Informatica},
  volume  = {5},
  number  = {4},
  pages   = {333--352},
  month   = dec,
  year    = {1975},
  annote  = {[to get] A comparison of four termination proving methods.}
}
@Article{Avizienis:1976:FTS,
  author      = "Algirdas Avi{\v{z}}ienis",
  title       = "Fault-tolerant systems",
  OPTcrossref = "",
  OPTkey      = "",
  journal     = j-IEEE-TRANS-COMP,
  year        = "1976",
  volume      = "25",
  number      = "12",
  pages       = "1304--1312",
  month       = dec,
  OPTnote     = "",
  annote      = "This is a good and surprisingly advanced survey of
                 fault tolerance issues (mainly in hardware) as of
                 1976. The main points include comparing the
                 traditional `fault intolerant' approach which aims
                 on taking only the most reliable components and
                 putting them together without employing redundancy
                 and relying on manual maintenance in case of
                 failures, with the fault tolerant approach, which
                 uses protective redundancy. While the former can be
                 less costly in many situations, the latter is source
                 for higher dependability figures and has
                 psychological advantages if human lives could be
                 endangered by the system. However, the two
                 approaches are complementary! Furthermore,
                 Avizienis describes three aspects of fault tolerance
                 that have to be dealt with: (1) identification and
                 characterization of the fault set to be tolerated,
                 (2) development and choice of redundancy techniques,
                 (3) analytic or experimental prediction of the
                 effectiveness of the techniques. He also classifies
                 faults by duration, extent and value, and identifies
                 three forms of redundancy: hardware, software and
                 time. He gives a first notion of the two necessary
                 steps of detection and correction (see
                 \cite{Arora:1998:CDM}) and a lot of examples of
                 fault tolerant systems up to the year 1976. Overall,
                 a rich and despite its age still insight-heavy
                 paper."
}
@article{Denning:1976:LMS,
  author    = {Dorothy E. Denning},
  title     = {A Lattice Model of Secure Information Flow},
  journal   = j-CACM,
  year      = {1976},
  month     = may,
  volume    = {19},
  number    = {5},
  pages     = {236--243},
  OPTnote   = {Papers from the Fifth ACM Symposium on Operating
               Systems Principles (Univ. Texas, Austin, Tex., 1975).},
  abstract  = {Mechanisms that guarantee secure information flow in a
               computer system are discussed. These mechanisms are
               examined within a mathematical framework suitable for
               formulating the requirements of secure information flow
               among security classes. The central component of the
               model is a lattice structure derived from the security
               classes and justified by the semantics of information
               flow. The model provides a unifying view of all systems
               that restrict information flow, enables a
               classification of them according to security
               objectives, and suggests some new approaches. It also
               leads to the construction of automatic program
               certification mechanisms for verifying the secure flow
               of information through a program.},
  keywords  = {computer operating systems; data processing; lattice;
               mathematical models; program certification; secure
               information flow; security; security classes; security
               of data},
  treatment = {A Application; T Theoretical or Mathematical},
  annote    = {[to read]}
}
@Book{Dijkstra:1976:DP,
  author    = {Edsger W. Dijkstra},
  title     = {A Discipline of Programming},
  publisher = pub-PH,
  address   = pub-PH:adr,
  year      = {1976},
  annote    = {Great book.}
}
@article{Lamport:1977:PCM,
  author  = "Leslie Lamport",
  title   = "Proving the correctness of multiprocess programs",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  volume  = {3},
  number  = 2,
  pages   = {125--143},
  month   = mar,
  year    = 1977,
  annote  = {First definition of terms ``safety'' and
             ``liveness''. What else?}
}
@InProceedings{Pnueli:1977:TLP,
  author       = "Amir Pnueli",
  title        = "The temporal logic of programs",
  booktitle    = "Proceedings of the 18th IEEE Symposium on the
                  Foundations of Computer Science (FOCS-77)",
  address      = "Providence, Rhode Island",
  publisher    = pub-IEEE,
  organization = "IEEE",
  month        = oct # " 31--" # nov # " 2",
  year         = "1977",
  pages        = "46--57",
  annote       = "[to read] Presents the idea of reactive systems and
                  temporal logic in contrast to transformational
                  systems using Hoare Logic."
}
@inproceedings{Bartlett:1978:ANO,
  author    = {J. F. Bartlett},
  title     = {A {``NonStop''} operating system},
  booktitle = {Proceedings of the 11th Hawaii International Conference
               on System Sciences},
  year      = {1978},
  volume    = {3},
  annote    = {description of TANDEM system.}
}
@Article{Lamport:1978:TCO,
  author  = {Leslie Lamport},
  title   = {Time, clocks and the ordering of events in a
             distributed system},
  journal = j-CACM,
  year    = 1978,
  volume  = {21},
  number  = {7},
  month   = jul,
  pages   = {558--565},
  annote  = "A famous and well-readable paper on causality and
             possible causal dependencies in distributed
             systems. Lamport is first to introduce the
             ``happened before'' relation (which corresponds to
             causality) and proposes the use of logical time
             instead of real time in distributed systems. He
             characterises the relation as being a partial order
             and shows how his logical time can be used to do
             mutual exclusion. Work has subsequently led to
             vector time (Fidge/Mattern, cite?)."
}
@article{Wensley:1978:SDA,
  author  = {J. H. Wensley and L. Lamport and J. Goldberg and
             M. W. Green and K. N. Levitt and P. M. Melliar-Smith and
             R. E. Shostak and C. B. Weinstock},
  title   = {{SIFT}: Design and analysis of a fault-tolerant
             computer for aircraft control},
  journal = {Proceedings of the IEEE},
  year    = {1978},
  month   = oct,
  volume  = {66},
  number  = {10},
  pages   = {1240--1255},
  annote  = {[to read]}
}
@inproceedings{Lamport:1980:SSN,
  author    = {Leslie Lamport},
  title     = {`{Sometimes}' is sometimes `not never'},
  booktitle = {Proceedings of SIGPLAN-80, 7th ACM Symposium on
               Principles of Programming Languages},
  pages     = {174--185},
  address   = {Las Vegas, Nevada},
  year      = {1980},
  annote    = {Discusses a difference between branching time and
               linear time notions of temporal logic. In linear
               time `not eventually $\neg\phi$' is equivalent to
               `always $\phi$'. This is not true in branching
               time. Lamport discusses the assumptions made by
               computer scientists about temporal properties:
               ``The logic of linear time was used by Pnueli
               [...], while the logic of branching time seems
               to be the one used by most computer scientists
               for reasoning about temporal concepts.'' As every
               paper by Lamport, extremely well readable stuff!}
}
@Article{Pease:1980:RAP,
  author  = "M. Pease and R. Shostak and L. Lamport",
  title   = "Reaching Agreement in the Presence of Faults",
  journal = j-ACM,
  volume  = "27",
  number  = "2",
  pages   = "228--234",
  month   = apr,
  year    = "1980",
  annote  = "This paper is similar to their 1982 publication
             \cite{Lamport:1982:BGP}, but contains a rigorous proof
             of the impossibility of Byzantine agreement for the
             case $n=3$, $t=1$. As usual, $n$ is the total number of
             processes and $t$ is the number of faulty processes.",
}
@book{Burris:1981:CUA,
  author    = "Stanley N. Burris and H. P. Sankappanavar",
  title     = "A course in universal algebra",
  publisher = pub-SV,
  year      = "1981",
  note      = "Revised edition online at
               \url{http://thoralf.uwaterloo.ca/htdocs/ualg.html}"
}
@book{Gries:1981:SP,
  author    = "David Gries",
  title     = "The Science of Programming",
  publisher = pub-SV,
  year      = "1981"
}
@Article{Pnueli:1981:TSC,
  author  = {Amir Pnueli},
  title   = {The temporal semantics of concurrent programs},
  journal = {Theoretical Computer Science},
  year    = {1981},
  volume  = {13},
  pages   = {45--60},
  annote  = {The semantics of a concurrent program specifies the
    set of execution sequences which are admissible as proper execution
    sequences of the program. Two main things must hold: (1) every state
    is obtained from its predecessor by executing a single enabled atomic
    action in one process, (2) no process which is infinitely often
    enabled will be infinitely often delayed (strong fairness). With
    this type of semantics one can introduce temporal operators ``always''
    and ``eventually'' which can be used to precisely reformulate the
    usual program properties like termination, partial and total correctness,
    deadlock/starvation freedom etc. Also, proving that a program
    possesses some property reduces to proving a set inclusion. The logic
    still contains a ``next state'' operator which is argued against
    by Lamport in \cite{Lamport:1983:WGT} because it doesn't support
    hierarchical proofs. Lamport regards this paper as the first to consider
    identifying programs with execution sequences and thus place programs
    and specifications onto the same formal level \cite{Abadi:1993:CS}.}
}
@techreport{Rabin:1981:HES,
  author      = {M. Rabin},
  title       = {How to exchange secrets by oblivious transfer},
  number      = {TR-81},
  institution = {Harvard Aiken Computation Laboratory},
  year        = {1981},
  annote      = {A probabilistic exchange protocol similar to
                 \cite{Blum:1983:HES}. [to get]}
}
@Article{Chang:1982:EAD,
  author  = {E. J.-H. Chang},
  title   = {Echo algorithms: {Depth} parallel operations on
             general graphs},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year    = {1982},
  volume  = {SE-8},
  number  = {4},
  pages   = {391--401},
  month   = jul,
  annote  = {[to get] Reference to Echo algorithm}
}
@incollection{Girault:1982:PPC,
  author    = "C. Girault",
  title     = "Proof of protocols in the case of failures",
  editor    = "J. Evans",
  booktitle = "Parallel processing systems. An advanced course",
  publisher = "Cambridge University Press",
  year      = "1982",
  pages     = "121--139",
  annote    = "[to read]"
}
@Article{Lamport:1982:BGP,
  author      = "L. Lamport and R. Shostak and M. Pease",
  title       = "The {Byzantine} generals problem",
  OPTcrossref = "",
  OPTkey      = "",
  journal     = j-TOPLAS,
  year        = "1982",
  volume      = "4",
  number      = "3",
  pages       = "382--401",
  month       = jul,
  OPTnote     = "",
  annote      = "This is one of the all time classic papers in fault
                 tolerant distributed computing: the Byzantine
                 Generals Problem (BGP) is presented and scenarios
                 are discussed where it is solvable and
                 unsolvable. The BGP consists of a set of nodes in a
                 completely connected network, one of which is called
                 the commander and all others are lieutenants. There
                 can be a certain number m of traitors in the set of
                 nodes. The problem is that the commander sends an
                 order to all lieutenants and (1) all lieutenants
                 must obey the same order, and (2) if the commander
                 is not a traitor then every other non-traitor obeys
                 the order he sends. The real world scenarios where
                 this problem exists are those where a set of
                 replicated processors must act in unison despite the
                 fact that all get different input (high reliability
                 systems). It turns out that the problem is
                 unsolvable if there are no more than 3m nodes in the
                 network. If messages can be signed, then it remains
                 unsolvable if half the nodes can be traitors. On the
                 other hand, if there are 3m+1 nodes (or 2n+1
                 respectively), then the BGP is solvable. Two
                 algorithms are given. They are presented and proved
                 in a recursive/inductive fashion which is quite
                 stunning. The authors remark, that the problem is
                 unsolvable in asynchronous systems (where there is
                 no possibility of implementing synchronized clocks
                 in the presence of faults). Also, their algorithm for
                 the 3m+1 case seems to be optimal although it
                 requires a message path of m+1 and has a high
                 message complexity. The authors argue that extremely
                 high reliability has its cost. Byzantine behaviour
                 is implicitly modeled by always choosing the worst
                 choice, or considering all choices and choosing the
                 worst."
}
@InProceedings{Ben-Or:1983:AAF,
  author    = "Michael Ben-Or",
  title     = "Another Advantage of Free Choice: Completely
               Asynchronous Agreement Protocols",
  booktitle = "Proc. Second Ann. ACM Symp. on Principles of
               Distributed Computing",
  year      = "1983",
  pages     = "27--30",
  annote    = "Ben-Or's probabilistic algorithm for asynchronous
               Byzantine agreement was one of the first
               published solutions to the problem, and remains the
               simplest. Processes toss coins independently to reach
               consensus on a value. His algorithm requires that less
               than one-fifth of the processes are faulty for
               correctness to be guaranteed. The expected number of
               rounds is exponential in the number of processes $n$,
               but becomes a constant when the number of faulty
               processes is $O(\sqrt{n})$.",
}
@Article{Blum:1983:HES,
  author  = "Manuel Blum",
  title   = "How to Exchange (Secret) Keys",
  journal = j-TOCS,
  volume  = "1",
  number  = "2",
  pages   = "175--193",
  month   = may,
  year    = "1983",
  bibdate = "Thu Jan 14 11:57:59 1999",
  note    = "Previously published in ACM STOC '83 proceedings,
             pages 440--447.",
  annote  = "A protocol is presented to fairly exchange secrets using
    number theoretic means. Two parties, Alice and Bob, are assumed to
    have equal computing capabilities and equal knowledge of
    algorithms. There is no need for a trusted intermediary and no need
    for a judge outside of the system. There is a negligible probability
    of cheating. The idea is to use gradual exchange and after
    exchanging an individual bit, do some sort of zero-knowledge-proof
    to witness that the bit is actually a valid bit. This is done by a
    complicated challenge response type of method which I do not
    understand (quadratic residues, etc. involved). The probability that
    either can cheat the protocol can be made arbitrarily
    small. However, the usual problems with gradual exchange protocols
    still exist. Section 13 presents some interesting ideas regarding
    pricing of gradually exchanged bits. Claims to be similar to
    an early TR of Rabin \cite{Rabin:1981:HES}."
}
@Article{Lamport:1983:SCP,
  author    = {Leslie Lamport},
  title     = {Specifying concurrent program modules},
  journal   = j-TOPLAS,
  year      = {1983},
  volume    = {5},
  number    = {2},
  pages     = {190--222},
  month     = apr,
  OPTannote = {to get}
}
@InProceedings{Lamport:1983:WGT,
  author    = "Leslie Lamport",
  title     = "What good is Temporal Logic?",
  booktitle = "Proceedings of the {IFIP} Congress on Information
               Processing",
  year      = "1983",
  editor    = "R. E. A. Mason",
  pages     = "657--667",
  publisher = "North-Holland",
  address   = "Amsterdam",
  annote    = "This is a more informal and easy going introduction into the
    merits of temporal logic than \cite{Lamport:1983:SCP}, much in the
    spirit of a later and more refined exposition
    \cite{Lamport:1989:SAS}. Lamport proposes a formal language
    because ``natural languages are very expressive and very
    imprecise'' while ``formal languages are not very expressive but
    very precise.'' The distinction is again that of safety and
    liveness properties, where safety properties can be used to reason
    about real-time behavior if the notion of a clock is added. The
    concept of stuttering is motivated and other temporal logic
    formalisms as of 1983 are briefly surveyed. Finally, Lamport
    elaborates on the hierarchy of programming languages, starting
    from high level specifications and ending at the quantum level of
    electrons. Temporal logic can provide a framework for reasoning at
    all these levels.",
}
@Article{Schlichting:1983:FSP,
  author  = "Richard D. Schlichting and Fred B. Schneider",
  title   = "Fail-stop processors: {An} approach to designing
             fault-tolerant computing systems",
  journal = j-TOCS,
  year    = "1983",
  volume  = "1",
  number  = "3",
  pages   = "222--238",
  month   = aug,
  annote  = "[to read]"
}
@Book{Strohrmann:1983:AMM,
  author    = {G. Strohrmann},
  title     = {{Anlagensicherung mit Mitteln der MSR-Technik}},
  publisher = {Oldenbourg},
  address   = {M{\"u}nchen},
  year      = {1983},
  annote    = {[Angabe von Armin]}
}
@inproceedings{Broder:1984:FCM,
  author       = {Andrei Z. Broder and Danny Dolev},
  title        = {Flipping coins in many pockets ({Byzantine} agreement
                  on uniformly random values)},
  booktitle    = {25th Annual Symposium on Foundations of Computer
                  Science},
  organization = {IEEE},
  address      = {Singer Island, Florida},
  month        = "24--26 " # oct,
  year         = {1984},
  pages        = {157--170},
  annote       = {Discusses randomized Byzantine agreement where a set of
    processes agree on a common bit using a random coin. Gives algorithm
    which works if the faulty processes are not the majority. Extends
    the impossibility result for deterministic consensus by showing that
    there is no Byzantine agreement protocol tolerant against $t$
    fail-stop faults that works in less than $t+1$ rounds.}
}
@Article{Dijkstra:1983:DTD,
  author          = "Edsger W. Dijkstra and W. H. J. Feijen and A. J. M.
                     {van Gasteren}",
  title           = "Derivation of a Termination Detection Algorithm for
                     Distributed Computations",
  journal         = j-IPL,
  volume          = "16",
  number          = "5",
  pages           = "217--219",
  day             = "10",
  month           = jun,
  year            = "1983",
  coden           = "IFPLAT",
  ISSN            = "0020-0190",
  mrclass         = "68B05 (68C05)",
  mrnumber        = "84m:68005",
  bibdate         = "Wed Nov 11 12:16:26 MST 1998",
  acknowledgement = ack-nhfb,
  classification  = "723; B6210L (Computer communications); C5620
                     (Computer networks and techniques); C6150J (Operating
                     systems)",
  corpsource      = "Burroughs, AL Nuenen, Netherlands",
  journalabr      = "Inf Process Lett",
  keywords        = "computer programming; distributed computations;
                     distributed processing; networks; protocols;
                     termination detection algorithm",
  treatment       = "P Practical",
}
@InCollection{Echtle:1984:FSV,
  author    = "Klaus Echtle",
  title     = "{Fehlermodellierung} bei {Simulation} und
               {Verifikation} von {Fehlertoleranz-Algorithmen}
               f{\"u}r {Verteilte} {Systeme}",
  booktitle = "{Software-Fehlertoleranz} und {-Zuverl\"assigkeit}",
  publisher = pub-SV,
  year      = "1984",
  editor    = "F. Belli and S. Pfleger and M. Seifert",
  number    = "83",
  series    = "Informatik-Fachberichte",
  pages     = "73--88",
  OPTnote   = "(in German)",
  annote    = "Two types of fault models are described and
               compared: low level fault specifications (LLFS,
               `aufz{\"a}hlendes Fehlermodell') and high level fault
               specifications (HLFS, `spezifizierendes
               Fehlermodell'). LLFS consist of a detailed
               description of what type of faults may happen and
               when/where they are supposed to occur (e.g., send
               omission etc.). They are well suited for simulation
               and testing. HLFS are a high level description of
               how a node's behavior changes in the presence of
               faults. This is expressed at the interfaces between
               nodes of a distributed system: usually the actions
               at an interface reflect certain requirements of a
               protocol specification. The occurrence of faults at
               a node weakens these requirements. To an extreme
               (Byzantine behavior), there are no restrictions on
               what might happen at an interface. HLFS influence
               interface specifications and are only suited for
               verification purposes. Both LLFS and HLFS are
               compared according to their suitability for
               verification. Finally, the importance of hierarchic
               fault modelling is stressed to master
               complexity. This can be seen as an early predecessor
               of the concept of multitolerance
               \cite{Arora:1998:CDM}."
}
@PhdThesis{Hadzilacos:1984:IFT,
  author = {Vassos Hadzilacos},
  title  = {Issues of Fault Tolerance in Concurrent Computations},
  school = {Harvard University},
  year   = {1984},
  note   = {Also published as Technical Report TR11-84.},
  annote = {First mention of send omission type of
            faults. Reference found in
            \cite{Schneider:1993:WGM,Hadzilacos:1994:MAF}.}
}
@Article{Lamport:1984:UTI,
  author  = {Leslie Lamport},
  title   = {Using Time Instead of Timeout for Fault-Tolerant
             Distributed Systems},
  journal = j-TOPLAS,
  volume  = {6},
  number  = {2},
  pages   = {254--280},
  month   = apr,
  year    = {1984},
  annote  = {[not by me:] processes are synchronized by clocks,
             and the clocks
             are synchronized using the Byzantine Generals solution.
             Time intervals are used. [to get]},
}
@Article{Lundelius:1984:ULB,
  author  = {Jennifer Lundelius and Nancy Lynch},
  title   = {An Upper and Lower Bound for Clock Synchronization},
  journal = {Information and Control},
  volume  = {62},
  number  = {2/3},
  pages   = {190--204},
  month   = aug # "/" # sep,
  year    = {1984},
  annote  = {Prove a result similar to \cite{Dolev:1986:PIA}: The
             clocks of $n$ processes cannot be deterministically synchronized
             more closely than $e(1-1/n)$, where $e$ is the uncertainty of message
             delivery times. The assumptions are clocks running at the same
             speed but initialized differently, the given uncertainty $e$,
             and no failures. The graph is completely connected. The result
             states how close clock values can be at the same real time,
             whereas \cite{Dolev:1986:PIA} characterize how close the
             real times can be when clocks show the same value. The idea of
             the proof is to construct runs which look the same to the
             processes but result in different clock values/real times at
             different points.}
}
@TechReport{Shah:1984:DSS,
  author      = {Amitabh Shah and Sam Toueg},
  title       = {Distributed Snapshots In Spite of Failures},
  institution = {Cornell University, Computer Science Department},
  type        = {Technical Report},
  number      = {TR84-624},
  pages       = {14},
  month       = jul,
  year        = {1984},
  note        = {Revised February 1985},
  abstract    = {An extension of the Chandy-Lamport algorithm
                 ([Chan84]) to find global states of distributed systems
                 is presented where benign failures of processes and
                 channels are permitted. The scope of the algorithm in
                 detecting stable properties in distributed systems is
                 discussed. As an application, an algorithm to detect
                 deadlocks in failure-prone distributed systems is
                 presented.},
  annote      = {Extends the Chandy-Lamport snapshot algorithm
                 \cite{Chandy:1985:DSD} to deal with crash-recover faults and
                 message losses. The system model is the asynchronous one of
                 \cite{Chandy:1985:DSD} and the algorithm uses a simple timeout
                 mechanism to check the functional state of neighboring processes
                 (today we call this an unreliable failure detector). Channels are
                 FIFO and flushing messages are used just like in
                 \cite{Chandy:1985:DSD}. However, due to obvious impossibilities
                 the notion of a consistent cut must be weakened in a way that
                 includes uncertainty. Termination is guaranteed by the timeout
                 solution, but the result may be `uncertain' making it necessary to
                 restart the algorithm again. It doesn't seem to be guaranteed that
                 eventually a stable predicate is detected because of possible
                 channel failures (what about false suspicions and virtual
                 partitions?). Has this algorithm been published elsewhere?}
}
@Article{Spector:1984:SSP,
  author  = {Alfred Spector and David Gifford},
  title   = {The space shuttle primary computer system},
  journal = j-CACM,
  volume  = 27,
  number  = 9,
  pages   = {874--900},
  month   = sep,
  year    = 1984,
  annote  = {A detailed description of the computer system that
             runs the space shuttle.}
}
@Article{Alpern:1985:DL,
  author  = {Bowen Alpern and Fred B. Schneider},
  title   = {Defining liveness},
  journal = j-IPL,
  volume  = 21,
  number  = 4,
  pages   = {181--185},
  month   = oct,
  year    = 1985,
  annote  = {Standard definitions of system properties, safety
             and liveness. Shows that every nontrivial system
             property can be expressed as an intersection of a
             safety property and a liveness property. Terms
             safety and liveness defined by Lamport
             \cite{Lamport:1977:PCM}.}
}
@Article{Bracha:1985:ACB,
  author  = {Gabriel Bracha and Sam Toueg},
  title   = {Asynchronous Consensus and Broadcast Protocols},
  journal = j-ACM,
  volume  = {32},
  number  = {4},
  pages   = {824--840},
  month   = oct,
  year    = {1985},
  annote  = {The authors investigate probabilistic consensus
             protocols for ``FLP'' model
             \cite{Fischer:1985:IDC}. Probabilities are
             introduced by making assumptions about the message
             subsystem, i.e. the probability that a node receives
             a message from all non-faulty nodes can be
             calculated. For the fail-stop model half of the
             nodes may be faulty to still achieve consensus with
             probability 1, for Byzantine faults at most one
             third may be faulty. The relevant protocols and an
             application to reliable broadcast are given.}
}
@Article{Chandy:1985:DSD,
  author  = {K. M. Chandy and Leslie Lamport},
  title   = {Distributed snapshots: determining global states of
             distributed systems},
  journal = {ACM Transactions on Computer Systems},
  volume  = {3},
  number  = {1},
  pages   = {63--75},
  month   = feb,
  year    = {1985},
  annote  = {nicht kopiert}
}
@InProceedings{Coan:1985:DFS,
  author    = {Brian A. Coan and Danny Dolev and Cynthia Dwork and
               Larry Stockmeyer},
  title     = {The Distributed Firing Squad Problem (Preliminary
               Version)},
  booktitle = {Proceedings of the Seventeenth Annual {ACM} Symposium
               on Theory of Computing},
  address   = {Providence, Rhode Island},
  pages     = {335--345},
  month     = "6--8 " # may,
  year      = {1985},
  annote    = {[to read]}
}
@Article{Cristian:1985:RAF,
  author  = {Flaviu Cristian},
  title   = {A rigorous approach to fault-tolerant programming},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  volume  = 11,
  number  = 1,
  pages   = {23--31},
  month   = jan,
  year    = 1985,
  annote  = {First idea of defining faults as spontaneous actions
             on an extended system space.}
}
@Article{Fischer:1985:IDC,
  author    = {Michael J. Fischer and Nancy A. Lynch and Michael S.
               Paterson},
  title     = {Impossibility of distributed consensus with one
               faulty process},
  journal   = j-ACM,
  volume    = 32,
  number    = 2,
  pages     = {374--382},
  month     = apr,
  year      = 1985,
  OPTannote = {Landmark paper in fault-tolerant distributed
               computing. The system considered is completely
               asynchronous, nodes may stop by halting (crash
               failure) but may not exhibit hostile (Byzantine)
               behaviour, the message system is reliable with a
               reliable broadcast primitive, no synchronized clocks
               or the possibility to detect failures are
               assumed. The authors show that every non-trivial
               execution can go on forever without reaching a
               result, because it is in effect not possible to
               distinguish a crashed node from one that is merely
               very slow. The proof is very detailed and is based
               on non-constructive methods that produce a
               contradiction from opposite assumptions. (Proof is
               explained in other words in \cite{Turek:1992:MFC}.)}
}
@Article{Halpern:1985:OPP,
  author  = {Joseph Y. Halpern and Nimrod Megiddo and Ashfaq A.
             Munshi},
  title   = {Optimal Precision in the Presence of Uncertainty},
  journal = {Journal of Complexity},
  volume  = {1},
  number  = {2},
  pages   = {170--196},
  month   = dec,
  year    = {1985},
  annote  = {Analyzes the imprecision inherent in distributed
             systems that have uncertain message delays. Takes the model of
             \cite{Dolev:1986:PIA} and wants to execute coordinated actions
             (instead of doing clock synchronization). Assumes that hardware
             clocks run at the same rate, yet may be initialized differently,
             and that messages have a maximum delivery delay. Basically enriches
             the lower bound of \cite{Dolev:1986:PIA}, and states that
             probabilistic algorithms can do no better (with certainty). Hmm,
             see \cite{Cristian:1989:PCS}. Investigates the situation in
             which there are Byzantine nodes. Gives an algorithm to compute
             optimal precision in cases without faults and bounded precision
             in cases with faults.}
}
%%% NOTE: the citation tag says 1984 while the year field is 1985.  The
%%% tag is kept as is because another entry cites it as Hoare:1984:CSP.
@Book{Hoare:1984:CSP,
  author    = {C. A. R. Hoare},
  title     = {Communicating Sequential Processes},
  publisher = {Prentice-Hall},
  year      = {1985},
}
@Article{Awerbuch:1985:CNS,
  author  = {Baruch Awerbuch},
  title   = {Complexity of Network Synchronization},
  journal = j-ACM,
  volume  = {32},
  number  = {4},
  pages   = {804--823},
  month   = oct,
  year    = {1985}
}
@InProceedings{Laprie:1985:DCF,
  author    = {J. C. Laprie},
  title     = {Dependable computing and fault tolerance: concepts
               and terminology},
  booktitle = pro-ftcs85,
  pages     = {2--11},
  month     = jun,
  year      = {1985},
  OPTannote = {[to read]}
}
@Article{Lamport:1985:SCP,
  author   = {Leslie Lamport and P. M. Melliar-Smith},
  title    = {Synchronizing Clocks in the Presence of Faults},
  journal  = j-ACM,
  volume   = {32},
  number   = {1},
  pages    = {52--78},
  month    = jan,
  year     = {1985},
  url      = {http://www.acm.org/pubs/toc/Abstracts/0004-5411/2457.html},
  abstract = {Algorithms are described for maintaining clock
              synchrony in a distributed multiprocess system where
              each process has its own clock. These algorithms work
              in the presence of arbitrary clock or process failures,
              including ``two-faced clocks'' that present different
              values to different processes. Two of the algorithms
              require that fewer than one-third of the processes be
              faulty. A third algorithm works if fewer than half the
              processes are faulty, but requires digital
              signatures.},
  keywords = {algorithms; Byzantine failures; clocks, electric ---
              Synchronization; computer programming --- Algorithms;
              computer systems programming --- Multiprocessing
              Programs; computer systems, digital; Fault Tolerant
              Capability; interactive convergence algorithm;
              reliability; theory; verification; Zeitliche Ordnung},
  annote   = {Investigates Byzantine clock synchronization. Surveyed
              in \cite{Ramanathan:1990:FCS}. [to get]}
}
@Article{Arora:1986:DTD,
  author  = {Rada Krishan Arora and S. P. Rana and M. N. Gupta},
  title   = {Distributed termination detection algorithm for
             distributed computations},
  journal = j-IPL,
  volume  = 22,
  pages   = {311--314},
  month   = may,
  year    = 1986,
  annote  = {See also \cite{Tan:1986:CDT,Arora:1988:MCD}.}
}
@Article{Berglund:1986:IV,
  author  = {Eric J. Berglund},
  title   = {An introduction to the {V}-system},
  journal = {IEEE Micro},
  volume  = {6},
  number  = {4},
  pages   = {35--52},
  month   = aug,
  year    = {1986},
  annote  = {[to read]}
}
@Article{Chandy:1986:HPL,
  author  = {K. M. Chandy and Jayadev Misra},
  title   = {How processes learn},
  journal = j-DC,
  volume  = {1},
  pages   = {40--52},
  year    = {1986},
  annote  = {A formal article on knowledge of processes and how
             it is gained and lost. The notion of knowledge is
             defined using the concept of isomorphism. Two system
             computations are isomorphic with respect to a
             process if the behaviour of the process is identical
             in both computations. This means essentially that
             ``a process cannot distinguish between them''. A
             fact that is valid in all indistinguishable
             computations is said to be known by a
             process. An important type of predicate is a local
             predicate (which is affected merely by state changes
             on one process). These results can be applied to
             situations in which the question is asked: Is a
             process unsure about a fact? These scenarios include
             the impossibility to detect whether a process has
             crashed. The theory is also applied to show that
             there must be causal message chains in mutual
             exclusion protocols and that the complexity of
             termination detection is at least as large as the
             message complexity of the underlying computation.}
}
@InProceedings{Cleve:1986:LSC,
  author    = {Richard Cleve},
  title     = {Limits on the Security of Coin Flips when Half the
               Processors Are Faulty (Extended Abstract)},
  booktitle = {Proceedings of the Eighteenth Annual {ACM} Symposium
               on Theory of Computing},
  address   = {Berkeley, California},
  pages     = {364--369},
  month     = "28--30 " # may,
  year      = {1986},
  annote    = {The 2-processor-bit-selection problem is to devise a
               protocol between two processes $A$ and $B$ with the
               following properties: $A$ and $B$ start with a random
               bit value and after termination of the protocol both
               processes output a value $a$ and $b$, respectively,
               where $a=b$ (agreement). Processes internally have
               access to a random variable. A weaker definition of
               agreement states, that the probability that $a=b$
               must be bounded from below by $1-O(1/n^k)$ where $n$
               and $k$ are not clear to me. A
               2-processor-bit-selection-scheme is secure if the
               protocol achieves the weaker definition of agreement
               (or better) even in the case where one process is
               replaced by a faulty one. The author gives an
               impossibility result stating that there exists no
               secure 2-processor-bit-selection protocol (Section
               2.2). (I didn't get the idea behind the proof.) This
               result is extended to a definition of an
               $s$-processor-bit-selection scheme. The new result
               states that it is impossible to reach (weak)
               agreement if $\lceil s/2 \rceil$ of the processors
               are faulty. The paper must be seen in the context of
               probabilistic Byzantine agreement, I suppose.}
}
@InProceedings{Cristian:1986:CSP,
  author    = {F. Cristian and H. Aghili and R. Strong},
  title     = {Clock Synchronization in the Presence of Omission and
               Performance Faults},
  booktitle = pro-ftcs86,
  publisher = pub-IEEE,
  address   = {Vienna, Austria},
  pages     = {218--223},
  year      = {1986},
  annote    = {Revised version read as \cite{Cristian:1994:CSP}.}
}
@Article{Dolev:1986:PIA,
  author  = {Danny Dolev and Joseph Y. Halpern and H. Raymond Strong},
  title   = {On the possibility and impossibility of achieving clock
             synchronization},
  journal = {Journal of Computer and System Sciences},
  volume  = {32},
  number  = {2},
  pages   = {230--250},
  month   = apr,
  year    = {1986},
  annote  = {The authors prove that clock synchronization is impossible
             without authentication if at least one third of the processors are
             faulty. They also give a lower bound on the precision of local clocks:
             Define $U$ to be the maximum uncertainty in the network, i.e. the
             maximum difference between minimum and maximum message transmission
             time for any pair of directly connected processes. The imprecision
             of local clocks is at least half the uncertainty, i.e. there is no
             algorithm that synchronizes clocks of two adjacent processes closer
             than $U/2$. An extended result appears in \cite{Halpern:1985:OPP}.
             A result similar to this can be found in \cite{Lundelius:1984:ULB}
             (see the discussion there).}
}
@Article{Dolev:1986:RAA,
  author  = {Danny Dolev and Nancy A. Lynch and Shlomit S. Pinter
             and Eugene W. Stark and William E. Weihl},
  title   = {Reaching approximate agreement in the presence of faults},
  journal = j-ACM,
  volume  = {33},
  number  = {3},
  pages   = {499--516},
  month   = jul,
  year    = {1986},
  annote  = {[to read]}
}
@Article{Fischer:1986:EIP,
  author  = {Michael J. Fischer and Nancy A. Lynch and Michael
             S. Paterson},
  title   = {Easy impossibility proofs for distributed consensus
             problems},
  journal = j-DC,
  volume  = {1},
  pages   = {26--39},
  year    = {1986},
  annote  = {[to read]}
}
@Book{Francez:1986:F,
  author    = {Nissim Francez},
  title     = {Fairness},
  series    = {Texts and Monographs in Computer Science},
  publisher = pub-SV,
  year      = {1986},
  annote    = {A thorough book on many notions of fairness in
               scheduling concurrent actions. Possibly outdated
               because many new notions seem to have appeared (any
               references?)}
}
@Article{Liskov:1986:SDP,
  author    = {Barbara Liskov and William Weihl},
  title     = {Specifications of Distributed Programs},
  journal   = j-DC,
  publisher = pub-SV,
  volume    = {1},
  pages     = {102--118},
  year      = {1986},
  annote    = {An early advocate of having two separate sets of
               specifications: one for the normal operation and a weaker one for
               ``abnormal'' behavior (the tolerance specification of
               \cite{Gaertner:1999:ESD}). The authors argue that this is user
               friendly and also simplifies the specifications. Several examples
               of such specifications are given (which I did not look at in
               detail). The conclusions contain a somewhat misleading discussion
               on why liveness is not the correct property to describe abnormal
               behavior. Rather, the likelihood of abnormal behavior should be
               specified (but this is a point of future work). At the end, the
               authors indicate that having a tolerance specification eases the
               understanding of implementation constraints and so a tolerance
               specification is also of use to implementors. The tolerance
               specification can be seen as the ``first refinement'' of the
               original specification.},
}
@Article{Moses:1986:CHO,
  author  = {Yoram Moses and Danny Dolev and Joseph Y. Halpern},
  title   = {Cheating husbands and other stories: {A} case study of
             knowledge, action, and communication},
  journal = j-DC,
  volume  = {1},
  pages   = {167--176},
  year    = {1986},
  annote  = {The authors again take the cheating husbands puzzle
             to show subtle interactions between knowledge,
             action, and communication in distributed
             systems. They discuss the cases of asynchronous
             communication, synchronous communication, weakly
             synchronous communication with bound b, and
             asymmetry in communication (ring topology). The
             relationship to eventual common knowledge, common
             knowledge and b-common knowledge are
             shown. Moreover, in the synchronous case, faulty
             nodes can complicate the matter again (disobedient
             wives). This paper is shorter and thus a little more
             introductory than a later one
             \cite{Halpern:1990:KCK}.}
}
@Article{Myers:1986:CSF,
  author  = {W. Myers},
  title   = {Can software for the {Strategic Defense Initiative}
             ever be error free?},
  journal = j-IEEE-COMPUTER,
  volume  = {19},
  number  = {11},
  month   = nov,
  year    = {1986},
  annote  = {Presents figure that there are about 3.3 software errors
             per 1000 LoC. Peter G. Neumann comments on this in
             `Inside Risks' in late 2000 CACM.}
}
@Article{Perry:1986:DAP,
  author  = {Kenneth J. Perry and Sam Toueg},
  title   = {Distributed Agreement in the Presence of Processor and
             Communication Faults},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  volume  = {12},
  number  = {3},
  pages   = {477--482},
  month   = mar,
  year    = {1986},
  annote  = {First to define the general omission fault model
             consisting of crash, send- and receive-omission
             failures. [to get]}
}
@Article{Tan:1986:CDT,
  author  = {Richard B. Tan and Gerard Tel and Jan {van Leeuwen}},
  title   = {Comments on {``Distributed termination detection
             algorithm for distributed computations''} ({Letter} to
             the {Editor})},
  journal = j-IPL,
  volume  = 23,
  pages   = {163},
  month   = oct,
  year    = 1986,
  annote  = {Notes an error in the algorithm of \cite{Arora:1986:DTD}.
             See also \cite{Arora:1988:MCD}.}
}
@InProceedings{Attiya:1987:ACA,
  author     = {Hagit Attiya and Amotz Bar-Noy and Danny Dolev and
                Daphne Koller and David Peleg and R{\"u}diger Reischuk},
  title      = {Achievable cases in an asynchronous environment},
  booktitle  = {Proceedings of the 28th annual Symposium on the
                Foundations of Computer Science},
  publisher  = pub-IEEE-CSP,
  OPTaddress = pub-IEEE-CSP:adr,
  pages      = {337--346},
  month      = oct,
  year       = {1987},
  annote     = {The authors consider several problems and show that
                they are achievable in asynchronous systems despite
                the fact that things like consensus
                aren't. Problems considered are renaming of
                processors to compact the name space and the
                ``multi-slot critical section problem'' (which is
                mutual exclusion for more than one processor).}
}
@Book{Bernstein:1987:CCR,
  author    = {P. Bernstein and V. Hadzilacos and N. Goodman},
  title     = {Concurrency Control and Recovery in Database Systems},
  publisher = pub-AW,
  year      = {1987},
  OPTannote = {H.2.5/Bern nicht am Ort}
}
@Article{Birman:1987:RCP,
  author  = {K. P. Birman and T. A. Joseph},
  title   = {Reliable Communication in the Presence of Failures},
  journal = {ACM Transactions on Computer Systems},
  volume  = {5},
  number  = {1},
  pages   = {47--76},
  month   = feb,
  year    = {1987},
  annote  = {First reference to causal order, the generalization of
             Lamport's happened-before \cite{Lamport:1978:TCO}.}
}
@Article{Brooks:1987:NSB,
  author  = {Frederick P. Brooks},
  title   = {No Silver Bullet},
  journal = j-IEEE-COMPUTER,
  volume  = {20},
  number  = {4},
  pages   = {10--19},
  month   = apr,
  year    = {1987},
  annote  = {A famous paper on the ``essence and accidents in
             software engineering''. Brooks explores reasons for
             the fact that despite high hopes and great claims
             the software industry and computer science academia
             have failed to produce really reliable, error-free
             products. Brooks discusses facts like complexity and
             psychological problems for people involved. He states
             that the problem is at its core rather a human than
             a technical issue.}
}
@Article{Dolev:1987:MSN,
  author  = {Danny Dolev and Cynthia Dwork and Larry Stockmeyer},
  title   = {On the minimal synchronism needed for distributed consensus},
  journal = j-ACM,
  volume  = {34},
  number  = {1},
  pages   = {77--97},
  month   = jan,
  year    = {1987},
  annote  = {This is a refinement work of the paper by Fischer,
             Lynch and Paterson \cite{Fischer:1985:IDC}. The
             consensus problem is investigated in various
             different system models. Critical parameters that
             emerge are: processors synchronous/asynchronous,
             communication synchronous/asynchronous, message
             order synchronous/asynchronous, broadcast
             transmission or point to point, atomic receive/send
             or separate receive/send. The minimal cases to
             achieve consensus are: (1) synchronous processors
             and synchronous communication, (2) synchronous
             processors and synchronous message order, (3)
             synchronous message order and broadcast
             communication, (4) synchronous communication,
             broadcast transmission, and atomic receive/send. The
             intuition behind the results is that the system
             shouldn't be able to ``hide a critical
             step''. Probabilistic algorithms are not
             investigated. The proofs in this paper are large and
             quite intrinsic.}
}
@Article{Jifeng:1987:ASP,
  author  = {He, Jifeng and C. A. R. Hoare},
  title   = {Algebraic Specification and Proof of a Distributed
             Recovery Algorithm},
  journal = j-DC,
  volume  = {2},
  number  = {1},
  pages   = {1--12},
  year    = {1987},
  annote  = {A masking fault tolerant implementation to a crash-recover
             process is presented and proved using basics of CSP. Two different
             implementations are presented: one that simply replays and one that
             uses checkpoints. Faults are detected instantaneously and ``the only
             subtle point is to ensure the correct outcome even when [faults]
             occur in the middle of the recovery procedure.'' (p. 2) I didn't
             find this point though in the proof. The discussion makes a few good
             points and contributes to the overall quality of the paper: (1)
             instead of having a general purpose mechanism to prove any system,
             every application area probably will have its adopted calculus:
             ``Nevertheless, even for a grossly over-simplified problem, the
             algebraic calculations are non-trivial. This probably has to be
             accepted as inevitable in any serious application of mathematics to
             engineering. The calculations can be simplified by prior development
             of a calculus adapted more to the specific needs of a problem. It
             will be interesting to see how far such calculi are applicable to
             more general classes of problems; but it seems quite likely that they
             will not. Again, we may have to accept that each application will
             require derivation of specialized laws to control its complexity.''
             (page 9) (2) recovery with non-instantaneous fault detection can
             probably be based on logical time, (3) non-deterministic processes
             cannot use this type of recovery. A weakened specification is
             necessary.}
}
@Article{Joseph:1987:PRF,
  author  = {Mathai Joseph and Abha Moitra and Neelam
             Soundararajan},
  title   = {Proof Rules for Fault Tolerant Distributed Programs},
  journal = {Science of Computer Programming},
  volume  = {8},
  number  = {1},
  pages   = {43--67},
  month   = feb,
  year    = {1987},
  annote  = {The authors attempt to develop a set of rules to prove the
             correctness of CSP programs \cite{Hoare:1984:CSP} in faulty
             environments. The failure model is that of fail-stop, i.e. the
             authors assume detectable crash faults and recovery without stable
             storage. The method concentrates on partial correctness of
             terminating processes and also on the invariants of non-terminating
             processes (i.e. it concentrates on safety properties). The proof
             rules show how the interface (i.e. communication) behavior of
             processes is weakened by the failure model and how the behavior of
             the complete system can be obtained from the behaviors of the
             individual processes. The recovery-aspect of the failure model
             weakens the achievable safety property because of possible
             repetitions. But overall, the global invariant is the conjunction of
             the local invariants provided that the processes are ``compatible''
             (meaning that their communication behavior matches). Channels are
             assumed to be reliable. Sect. 6 contains the first derivation rule
             of weaker safety properties that I know of. A bounded buffer is
             taken as an example. The future work section discusses general
             liveness properties and states that they are difficult to prove!
             The basic fault-tolerance methodology involved here is based on
             detection (which is assumed to be automatic) and correction through
             recovery actions. This is the basis of later work in this direction
             \cite{Peled:1994:CFF,Arora:1998:CDM,Arora:1998:DCT}.}
}
@Article{Mattern:1987:ADT,
  author  = {Friedemann Mattern},
  title   = {Algorithms for distributed termination detection},
  journal = j-DC,
  volume  = {2},
  number  = {3},
  pages   = {161--175},
  year    = {1987},
  annote  = {[to get]}
}
@Article{Moran:1987:EIR,
  author             = {Shlomo Moran and Yaron Wolfstahl},
  title              = {Extended impossibility results for asynchronous
                        complete networks},
  journal            = j-IPL,
  volume             = {26},
  number             = {3},
  pages              = {145--151},
  day                = {23},
  month              = nov,
  year               = {1987},
  affiliationaddress = {Technion Israel Inst of Technology, Haifa, Isr},
  journalabr         = {Inf Process Lett},
  keywords           = {asynchronous complete networks; computational
                        complexity; computer fault tolerance; computer systems,
                        digital; consensus problem; consensus task; decision
                        graph; Distributed; distributed computation;
                        distributed processing; fault tolerance; fault tolerant
                        computing; graph theory; impossibility results;
                        mathematical techniques --- Graph Theory; protocol;
                        protocols; reliability; standardization; theory;
                        unsolvability; verification},
  annote             = {[to read]}
}
@Book{Raynal:1987:NDC,
  author    = {Michel Raynal},
  title     = {Networks and Distributed Computation: Concepts, Tools,
               and Algorithms},
  publisher = {North Oxford Academic Publishers},
  address   = {London},
  year      = {1987},
  ISBN      = {0-946536-27-9},
  keywords  = {book, text, parallel processing, supercomputers},
  note      = {Original French language edition Syst{\`e}mes r{\'e}partis
               et r{\'e}seaux (1987), translated by Meg Sanders},
}
@Article{Srikanth:1987:OCS,
  author   = {T. K. Srikanth and Sam Toueg},
  title    = {Optimal Clock Synchronization},
  journal  = j-ACM,
  volume   = {34},
  number   = {3},
  pages    = {626--645},
  month    = jul,
  year     = {1987},
  url      = {http://www.acm.org/pubs/toc/Abstracts/0004-5411/28876.html},
  abstract = {We present a simple, efficient, and unified solution
              to the problems of synchronizing, initializing, and
              integrating clocks for systems with different types of
              failures: crash, omission, and arbitrary failures with
              and without message authentication. This is the first
              known solution that achieves optimal accuracy - the
              accuracy of synchronized clocks (with respect to real
              time) is as good as that specified for the underlying
              hardware clocks. The solution is also optimal with
              respect to the number of faulty processes that can be
              tolerated to achieve this accuracy.},
  keywords = {algorithms; Byzantine failures; computer programming
              --- Algorithms; computer systems, digital; Distributed;
              message authentication; optimal clock synchronization;
              reliability; synchronizing in presence of faults;
              theory; verification},
  annote   = {Assumes that drift rate is bounded on processes and
              that there is a maximum message delivery delay. Gives tolerance
              specification of clock synchronization. Shows lower bound on
              accuracy dependent on the drift rate of clocks: the bound on the
              drift rate of logical clocks is at least as large as the bound of
              drift of the physical clocks (Theorem 2). They present an algorithm
              which reaches this bound.}
}
@Article{Apt:1988:AFL,
  author  = {Krzysztof R. Apt and Nissim Francez and Shmuel Katz},
  title   = {Appraising Fairness in Languages for Distributed
             Programming},
  journal = j-DC,
  volume  = {2},
  number  = {4},
  pages   = {226--241},
  year    = {1988},
  annote  = {A general formulation of fairness is: if a certain choice
             is possible infinitely often, then it is sufficiently often taken.
             Precise formulations depend on how `choice', `possible' and
             `sufficiently often' are defined. The authors propose three basic
             criteria which any sensible definition of fairness should have in
             any model: feasibility, equivalence robustness, and liveness
             enhancement. (a) Fairness usually rules out certain traces which would
             be acceptable in the given model of computation. Feasibility
             ensures that after ruling out unfair traces, still valid traces
             remain. More precisely, feasibility requires that for every point
             in a computation it should be possible to extend it to be a fair
             one. This is related to the notion of machine closure
             \cite{Lamport:2000:FAH}. (b) Equivalence robustness means that
             if a trace x is fair, then a trace y must also be fair where y
             results from x by resorting `independent' actions. (c) Liveness
             enhancement means that all distributed system models assume a
             fundamental liveness property, meaning for example that eventually
             the system will take a step if it is not deadlocked. A fairness
             definition must give `additional value' to such an assumption,
             i.e., there must be a program which has a liveness property only
             if the additional fairness requirement holds.}
}
@Article{Arora:1988:MCD,
  author  = {Rada Krishan Arora and M. N. Gupta},
  title   = {More comments on {``Distributed termination detection
             algorithm for distributed computations''} ({Letter} to
             the {Editor})},
  journal = j-IPL,
  volume  = 29,
  pages   = {53--55},
  month   = sep,
  year    = 1988,
  annote  = {See also \cite{Arora:1986:DTD,Tan:1986:CDT}. Tries to fix
             the error.}
}
@Book{Chandy:1988:PPD,
  author    = {K. Mani Chandy and Jayadev Misra},
  title     = {Parallel Program Design: {A} Foundation},
  publisher = pub-AW,
  address   = {Reading, Mass.},
  year      = {1988},
  annote    = {[to read]}
}
@Article{Dwork:1988:CPP,
  author  = {Cynthia Dwork and Nancy Lynch and Larry Stockmeyer},
  title   = {Consensus in the presence of partial synchrony},
  journal = j-ACM,
  volume  = {35},
  number  = {2},
  pages   = {288--323},
  month   = apr,
  year    = {1988},
  annote  = {The authors study practically motivated models of
             synchrony that lie between fully asynchronous and
             fully synchronous systems in which consensus shall
             be achieved. The models of partial synchrony studied
             include: (1) upper bounds on processor speeds and
             message latency exist but are unknown, and (2) upper
             bounds exist and are known, but only hold after
             some unknown time (eventually). In both cases
             consensus with different resiliency can be achieved.}
}
@InProceedings{Fidge:1988:TMP,
  author    = {Colin J. Fidge},
  title     = {Timestamps in message-passing systems that preserve
               the partial ordering},
  booktitle = {Proceedings of the 11th Australian Computer Science
               Conference},
  pages     = {56--66},
  year      = {1988},
  month     = feb
}
@InProceedings{Haban:1988:GEG,
  author    = {Dieter Haban and Wolfgang Weigel},
  title     = {Global Events and Global Breakpoints in Distributed
               Systems},
  booktitle = {Proceedings of the Twenty-First Annual Hawaii
               International Conference on System Sciences},
  editor    = {Bruce D. Shriver},
  volume    = {II (Software Track)},
  publisher = pub-IEEE,
  year      = {1988},
  month     = jan,
  pages     = {166--175},
  annote    = {[to read]}
}
@InProceedings{Herlihy:1988:RAO,
  author    = {Maurice P. Herlihy and Jeannette M. Wing},
  title     = {Reasoning about Atomic Objects},
  booktitle = {Proceedings of the Symposium on Formal Techniques in
               Real-Time and Fault-Tolerant Systems},
  editor    = {M. Joseph},
  series    = ser-LNCS,
  volume    = {331},
  publisher = pub-SV,
  year      = {1988},
  month     = sep,
  pages     = {193--208},
  ISBN      = {3-540-50302-1},
  annote    = {formal proof method for fault tolerant programs, to read}
}
@Book{Isermann:1988:DRS,
  author    = "Rolf Isermann",
  title     = "{Digitale Regelsysteme, Band I (in German)}",
  publisher = pub-SV,
  year      = "1988",
  annote    = "[to read]"
}
@Article{Kessels:1988:EPS,
  author  = {J. L. W. Kessels},
  title   = {An exercise in proving self-stabilization with a
             variant function},
  journal = j-IPL,
  volume  = 29,
  year    = 1988,
  pages   = {39--42},
  annote  = {Correctness proof of Dijkstra's 3-state mutual
             exclusion protocol \cite{Dijkstra:1974:SSS} using a
             bound function. It shows the general technique of
             proving convergence by a variant function and also
             exposes the intrinsic dangers and difficulties of
             this method.}
}
@Article{Knuth:1988:SDS,
  author  = "T. Knuth",
  title   = "{Schadenfr\"uherkennung durch Schwingungsanalysen --- Neue
             M\"oglichkeiten in der Instandhaltung}",
  journal = "Der Maschinenschaden",
  year    = "1988",
  volume  = "61",
  pages   = "70--74",
  annote  = "[Angabe von Armin]"
}
@InCollection{Mancini:1988:TTR,
  author    = {Luigi V. Mancini and Giuseppe Pappalardo},
  editor    = {M. Joseph},
  title     = {Towards a theory of replicated processing},
  booktitle = {Formal techniques in real-time and fault-tolerant
               systems},
  series    = ser-LNCS,
  volume    = {331},
  publisher = pub-SV,
  year      = {1988},
  annote    = {specification approach [to read]}
}
@InProceedings{Mattern:1988:VTG-CITE-1989-VERSION,
  author    = {Friedemann Mattern},
  title     = {Virtual time and global states of distributed systems},
  booktitle = {Proceedings of the International Workshop on Parallel
               and Distributed Algorithms},
  year      = {1988},
  editor    = {M. Cosnard},
  address   = {Chateau de Bonas, France},
  OPTmonth  = oct,
  publisher = {Elsevier},
  annote    = {Cite the 1989 version \cite{Mattern:1989:VTG} instead;
               according to its note, the paper is reprinted on pages
               123--133 in \cite{Yang:1994:GST}.}
}
@InProceedings{Miller:1988:BHD,
  author    = {Barton P. Miller and Jong-Deok Choi},
  title     = {Breakpoints and halting in distributed programs},
  booktitle = {Proceedings of the 8th International Conference on
               Distributed Computing Systems},
  pages     = {316--323},
  year      = {1988},
  annote    = {The authors address the problem of distributed
               debugging by formally defining distributed
               predicates and giving algorithms to detect such
               predicates and to halt a distributed computation in
               a consistent state once the predicate is
               satisfied. Predicates are either (1) simple
               predicates (defined over the state of a single
               process), (2) disjunctions of simple predicates, (3)
               linked predicates (a sequence of events ordered by
               the causality relation) and (4) conjunctions of
               simple predicates. Types (1) and (2) are easily
               detectable by detection modules within
               processes. Type (3) is detected by tracking event
               occurrences on the processes involved in each item
               on the predicate chain and following causal
               dependencies across channels with markers. Type (4)
               needs a definition of its semantics since there is
               no single notion of time: $A\land B$ is true
               whenever $A$ becomes true on one process and $B$
               subsequently becomes true on another process and $B$
               causally depends on $A$. If no causal relationship
               exists between $A$ and $B$, then a central observer
               is used to detect $A\land B$. An algorithm to detect
               these predicates and to halt the algorithm is given
               based on the Chandy/Lamport algorithm to observe
               computations (requires FIFO channels, but
               asynchronous system). The paper describes problems
               related to distributed debugging. The detection of
               conjunctions implicitly defines ``possibly($A\land
               B$)'' without stating how to detect this in all
               cases in a distributed fashion. But this is okay,
               since this paper rather aims at detecting dynamic
               properties, and possibly is a static property. See
               also a good continuation of this work by Babaoglu et
               al. \cite{Babaoglu:1996:UFS}.}
}
@InProceedings{Patterson:1988:CRA,
  author     = {David A. Patterson and Garth Gibson and Randy H.
                Katz},
  title      = {A {Case} for {Redundant} {Arrays} of {Inexpensive}
                {Disks} ({RAID})},
  booktitle  = {Proceedings of the ACM Conference on Management
                of Data (SIGMOD)},
  year       = {1988},
  month      = jun,
  OPTaddress = {Chicago, IL},
  pages      = {109--116},
  abstract   = {As processor and memory speeds increase at an
                exponential rate and single disk access times remain
                relatively constant, it is apparent that I/O bandwidth
                is likely to become a bottleneck in the performance of
                systems. One way to address this problem is by using
                disk arrays, i.e., sets of relatively inexpensive disks
                which can improve I/O bandwidth via parallel access.
                The problem with this approach is that simply using
                disk arrays can drastically reduce reliability. The
                approach of RAID is to use redundant disks of check
                data to bring reliability up to acceptable levels
                (i.e., failure rates better than expected useful life
                of the disks). Five levels of the RAID design are
                presented to address the issues of overhead cost (in
                terms of number of disks), useable storage capacity,
                and efficiency per disk for various read and write
                scenarios (i.e., large vs. small). These issues were
                considered in terms of {\em data rates} (supercomputer
                applications) and {\em I/O rates} (transaction
                systems). Level 5 RAID provides the best all around
                performance by distributing check data across the data
                disks to increase parallelism.},
  annote     = {to read}
}
@Book{Raynal:1988:DAP,
  author    = {Michel Raynal},
  title     = {Distributed Algorithms and Protocols},
  series    = {Wiley Series in Computing},
  pages     = {163},
  publisher = {John Wiley \& Sons},
  address   = {Chichester, England},
  year      = {1988},
  keywords  = {book, text, parallel processing, supercomputers,
               electronic data processing -- distributed processing,
               algorithms, computer network protocols},
  ISBN      = {0-471-91754-0},
  abstract  = {More theoretical book on the fundamental problems in
               distributed systems and some solutions. 1st. English
               issue 1988 (the French version was published in
               1985).\par ** Description ** The use of distributed
               algorithms offers the prospect of great advances in
               computing speed. This book provides a clear, practical,
               and up-to-date guide to distributed algorithms and
               protocols in the area of control. Much of the material
               has been heretofore unavailable in English. Each
               chapter considers a specific aspect of control, with an
               analysis of the problem, a description of the algorithm
               for solving it, and proofs of correctness. Chapters can
               be studied independently to find solutions to
               particular problems.\par ** Contents ** Introduction to
               Distributed Algorithms. Election and Mutual Exclusion
               Algorithms. Algorithms for Detection and Resolution of
               Deadlock. Algorithms for Detecting Termination.
               Protocols for Data Transfer. Management of Distributed
               Data. Problems of Gaining Consensus in the Presence of
               Uncertainties (or How to Avoid Byzantine Quarrels).
               References.},
  note      = {Algorithmes distribu{\'e}s et protocoles, translated by
               Jack Howlett},
}
@InProceedings{Abadi:1989:RUS,
  author    = {Mart{\'\i}n Abadi and Leslie Lamport and Pierre Wolper},
  title     = {Realizable and unrealizable specifications of
               reactive systems},
  booktitle = {Automata, Languages and Programming. 16th
               Int.~Colloquium Proceedings},
  pages     = {1--17},
  year      = {1989},
  editor    = {G. Ausiello and M. Dezani-Ciancaglini and S. Ronchi
               Della Rocca},
  volume    = {372},
  series    = ser-LNCS,
  address   = {Stresa, Italy},
  month     = jul,
  publisher = pub-SV,
  annote    = {A specification is a formula $E\Rightarrow M$ where $E$ is
    an assumption about the environment and $M$ is a property
    guaranteed by the system (this way of viewing specifications is
    described in
    \cite{Lamport:1989:SAS,Abadi:1993:CS}). Specifications can become
    unrealizable if $E$ asserts some property of the environment
    because this part of the universe is totally outside the control
    of the implementor. Thus, a specification is unrealizable if it
    constrains the environment. This paper studies the exact
    definitions and conditions of realizability. The first approach is
    to define a simple computer and base the definition of
    realizability on the fact that a specification can be implemented
    on such a device. On the other hand, it views a specification as
    the rules for a two-player infinite game where environment and
    system both take turns and try to win. The environment wins if it
    can produce unspecified executions. Otherwise the system wins. It
    turns out that a specification is realizable if the system has a
    winning strategy. Realizability of a specification is a different
    notion than consistency (i.e., whether the set of infinite
    behaviors of the system is nonempty). This paper is very
    theoretical and uses a lot of terminology and concepts that I am
    not familiar with (B\"uchi automata, Borel sets, etc). The ideas
    of game-playing and specifications appear again in
    \cite{Abadi:1993:CS} in an at least to me more understandable
    fashion. }
}
@InProceedings{Chor:1989:RBA,
  author    = {Benny Chor and Cynthia Dwork},
  title     = {Randomization in {Byzantine} Agreement},
  booktitle = {Advances in Computing Research 5: Randomness and
               Computation},
  publisher = {JAI Press},
  year      = {1989},
  pages     = {443--497},
  OPTnote   = {A useful survey of the myriad of randomized
               distributed algorithms for Byzantine agreement.},
  annote    = {[to read]}
}
@Article{Cristian:1989:PCS,
  author  = {Flaviu Cristian},
  title   = {Probabilistic clock synchronization},
  journal = j-DC,
  year    = {1989},
  volume  = {3},
  pages   = {146--158},
  annote  = {A very well-written introduction into clock synchronization
    in ``real'' systems, and a good starting point for a lecture on this
    topic. Assumes that there is a maximum drift rate but there is no
    maximum message delivery delay. In this setting, clock synchronization
    can only be achieved in a probabilistic manner and Cristian well explains
    the inherent tradeoffs. Mentions that modern quartz clocks have a
    drift rate of the order $10^{-6}$, messages have some minimum time
    to travel but the distribution of delivery times (while usually being
    close to the minimum) is arbitrary. Nodes and messages can only suffer
    performance failures. In a nice exposition, it is explained how
    a node reads another node's clock and within what bounds the reading
    is as well as the error. (The precision of the reading is better the
    shorter the round trip time of the reading was.) Fixing the error
    results in a maximum time which a node is willing to wait for a
    result. There is a fundamental trade-off between the precision of
    the reading and the probability of success. Other algorithms
    like \cite{Srikanth:1987:OCS,Dolev:1995:DFC} are deterministic,
    i.e. they always reach a result but have poor precision. There
    is a continuum of probabilistic algorithms between the bounds of
    setting the maximum waiting time. Setting it close to the
    minimum is ``aggressive'' and will get good results with low
    probability. The other extreme are deterministic algorithms. Cristian
    also sketches implementations of time services and gives real-world
    numbers to instantiate the formulas given. It shows that
    synchronization within milliseconds is achievable. Overall,
    one of my top-ten favourite papers. }
}
@InProceedings{Gopal:1989:RBS,
  author    = {A. Gopal and S. Toueg},
  title     = {Reliable Broadcast in Synchronous and Asynchronous
               Environments},
  booktitle = pro-wdag89,
  address   = {Nice, France},
  year      = {1989},
  pages     = {110--123},
  annote    = {[to read]}
}
@InProceedings{Gray:1989:LEF,
  author       = {Cary G. Gray and David R. Cheriton},
  title        = {Leases: {An} efficient fault-tolerant mechanism for
                  distributed file cache consistency},
  booktitle    = {Proceedings of the 12th ACM Symposium on Operating
                  System Principles},
  conflocation = {Litchfield Park, AZ, 3--6 December 1989},
  journal      = {Operating Systems Review},
  volume       = {23},
  number       = {5},
  year         = {1989},
  month        = dec,
  pages        = {202--210},
  key          = {Gray89},
  keywords     = {Gray89 time-based distributed coherency, distributed
                  file systems, V performance, lease},
  abstract     = {Caching introduces the overhead and complexity of
                  ensuring consistency, reducing some of its performance
                  benefits. In a distributed system, caching must deal
                  with the additional complications of communication and
                  host failures. {\em Leases} are proposed as a
                  time-based mechanism that provides efficient consistent
                  access to cached data in distributed systems.
                  Non-Byzantine failures affect performance, not
                  correctness, with their effect minimized by short
                  leases. An analytic model and an evaluation for file
                  access in the V system show that leases of short
                  duration provide good performance. The impact of leases
                  on performance grows more significant in systems of
                  larger scale and higher processor performance.},
  annote       = {A lease is a contract that gives its holder specified
    rights over an object for a limited period of time. In the case
    where file cache consistency is to be maintained, a cache must
    obtain a lease for an object when the application accesses that
    object. A lease implicitly contains a lease term (duration) which
    describes its validity over time. Only with a valid lease a cache
    is allowed to answer read requests for that object. If the cache
    is requested to update an object, the cache must obtain a lease
    (if it doesn't have one already) and must then obtain approval by
    all other leaseholders for the write. When granting approval,
    leaseholders give up their lease. Here, fault tolerance comes
    into play: a client wanting to update an object must wait either
    until it has an approval of all leaseholders or until all of
    their leases have expired. (To prevent starvation, no new leases
    for an object are granted during this waiting time.) This can
    effectively help combat non-Byzantine faults in the system. Leases
    can introduce \emph{false sharing}, i.e. lease conflicts where no
    actual write conflicts exist, for example if another client cache
    has obtained a lease but has stopped using the object long before
    the lease has expired. For this, short lease terms are good.
    Short lease terms also minimize the delay caused by network
    partitions and client crashes (this is analogous to short
    aggressive time-outs in failure detection). Long term leases
    have the advantage if objects are accessed repeatedly by the
    same client and there is little write sharing. Analytical and
    experimental results are presented, stating that lease terms
    of 5--10 seconds in the V system are quite good, based on
    read and write rates between 0.03 (writes) and 0.8 (reads)
    per second, message propagation of 1 msec, message processing
    time of 0.25 msec and maximum clock skew of 100 msec. These
    simulations however do not refer to fault tolerance issues.
    The leases mechanism is dependent on synchronized clocks. A
    minimum assumption is that clocks have a known bounded drift rate.
    In this case, leases can be simply communicated using their
    duration. Server clocks that advance too quickly and client
    clocks which are too slow are problematic and can cause errors
    while the opposite (e.g. slow server clocks etc.) simply cause
    more message traffic. The conclusions contain a good cite which
    is in the spirit of Cristian and Fetzer's timed model
    \cite{Cristian:1999:TAD}: ``The lease approach is an example of a
    communication and coordination mechanism and reasoning based on
    (real) time, the availability of clocks that measure the passage
    of time with modest accuracy, and the ability to draw conclusions
    after a passage of time, possibly in the absence of communication.
    [...] We see this use of time as a fundamental aspect of distributed
    systems with potential for significant extension beyond that
    described here.'' }
}
@Article{Halpern:1989:MKA,
  author    = {Joseph Y. Halpern and Ronald Fagin},
  title     = {Modelling knowledge and action in distributed systems},
  journal   = j-DC,
  year      = {1989},
  volume    = {3},
  pages     = {159--177},
  OPTannote = {[to do]}
}
@Article{Lamport:1989:SAS,
  author  = {Leslie Lamport},
  title   = {A simple approach to specifying concurrent systems},
  journal = j-CACM,
  year    = {1989},
  volume  = {32},
  number  = {1},
  pages   = {32--45},
  month   = jan,
  annote  = {An amusing but still challenging paper on formal
    specifications of concurrent programs. Lamport informally presents
    the ``transition axiom method'' which is described in detail in
    \cite{Lamport:1983:SCP}. A system is a `thing' that interacts with
    its environment through a well-defined interface. The system
    properties in question are described as safety and liveness, which
    capture the essence of system behavior relevant to the
    author. (There are system properties not expressible as safety and
    liveness, some are given, confer also \cite{Rushby:1994:CSP}.)
    Safety properties are discussed first: A simple soda vending
    machine with three (specification) states and four (specification)
    state transitions is taken as an example. The essence of Lamport's
    specification method is to say which state transitions are allowed
    and which ones aren't. A system may have some unspecified state
    set $S$, and a specification can be viewed as a restriction on
    some state function $f$ from $S$ to the set of specification
    states. The machine's behavior is a sequence of states
    $s_0,s_1,\ldots$ from $S$. A programmer wishing to implement the
    specification must find such a state function $f$ which changes
    its state according to the specification and some interface
    actions. Finding such a function is like proving that the
    implementation is correct regarding the specification. A
    specification must also always contain a description of the
    interface of the system in question. This description is naturally
    at an implementation level. The formula underlying a transition
    axiom specification is a temporal logic formula of the form
    $\exists f_1,\ldots,f_n$ for which
    $X(f_1,\ldots,f_n,g_1,\ldots,g_m)$. Here, $f_i$ are internal state
    functions and $g_i$ are state functions of the interface. The
    existential quantification over $f_i$ signifies the freedom of
    implementation. The fact that the $g_i$ are free variables means
    that they must appear in the implementation (i.e., are in fact
    implementation level). The internal states which are implied by a
    transition axiom specification constrain the implementation a bit;
    formalisms that do not constrain the implementation (like pure
    temporal logic) are however not more general than transition
    axioms. In fact, sometimes it's good to give some hints to an
    implementation. (However, a specification still should concentrate
    only on the externally visible behavior. Mechanisms not using
    additional state variables tend to be very complex.) The approach
    to write specifications then is to (1) choose a set of states (and
    thus state functions), (2) specify how they are allowed to change
    (these are the transition axioms), and (3) specify when they must
    change. Transition axioms are safety requirements, part 3
    specifies liveness requirements. Liveness requirements are written
    in temporal logic. A specification can be separated into safety
    and liveness, thus separating the transition axioms from the
    temporal logic part. Showing that an implementation satisfies a
    specification, one shows that the system's safety implies the
    safety specification and then that the system's safety and
    liveness imply the liveness specification. The system's safety and
    liveness are given by the implementation, which is a kind of lower
    level specification. The paper is written in a question/answer
    style which is very amusing. A rewarding paper.}
}
@InProceedings{Mattern:1989:VTG,
  author    = {Friedemann Mattern},
  title     = {Virtual time and global states of distributed
               systems},
  booktitle = {Proceedings of the International Workshop on Parallel
               and Distributed Algorithms},
  editor    = {M. Cosnard and others},
  publisher = {Elsevier Science Publishers},
  address   = {Chateau de Bonas, France},
  year      = {1989},
  pages     = {215--226},
  note      = {Reprinted on pages 123--133 in \cite{Yang:1994:GST}.},
  annote    = {Classic on vector time, consistent global states etc.}
}
@PhdThesis{Michel:1989:KDB,
  author = {Ruben Michel},
  title  = {Knowledge in distributed {B}yzantine environments},
  school = {Yale University},
  year   = {1989},
  annote = {requested from yale tech reports}
}
@Misc{Mills:1989:MPN,
  author       = "David L. Mills",
  title        = "Measured performance of the Network Time Protocol in
                  the Internet system",
  howpublished = "Internet Request for Comments RFC 1128",
  year         = "1989",
  month        = oct,
  OPTannote    = "to read"
}
@Article{Rabin:1989:EDI,
  author  = {Michael O. Rabin},
  title   = {Efficient dispersal of information for security, load
             balancing, and fault tolerance},
  journal = j-ACM,
  year    = {1989},
  volume  = {36},
  number  = {2},
  pages   = {335--348},
  month   = apr,
  annote  = {This is maybe a relation between security, fault
             tolerance and redundancy? Uses a scheme of information sharing
             to make information secure and available.}
}
@Book{Rao:1989:ECC,
  author    = "T. R. N. Rao and E. Fujiwara",
  title     = "Error-control coding for computer systems",
  publisher = "Prentice-Hall",
  year      = "1989",
  annote    = "My standard cite for error control and detection codes."
}
@Article{Venkatesan:1989:RPD,
  author  = {Subbarayan Venkatesan},
  title   = {Reliable protocols for distributed termination detection},
  journal = {IEEE Transactions on Reliability},
  year    = 1989,
  volume  = 38,
  number  = 1,
  pages   = {103--110},
  month   = apr,
  annote  = {Venky's Homepage: \url{http://www.utdallas.edu/~venky/}
    The paper looks at distributed termination detection in asynchronous
    systems with crash failures. It assumes that with $k$ failures the
    network stays connected and that channels are FIFO. States that
    termination detection is at least as hard as consensus and thus impossible
    in the given context, so it assumes what we today call a perfect
    failure detector. The presented protocol is based on a termination
    detection scheme built for fault free systems. If there can be $k$
    failures, the protocol elects $k+1$ leaders which replicate the
    state information of all other nodes. In case of a failure, the
    termination detection protocol is aborted and a new round is started.
    In this round, the leaders simulate the behavior of the crashed nodes.
    References a fault tolerant snapshot protocol by Shah and Toueg
    which seems to be only available as a Cornell TR \cite{Shah:1984:DSS}.}
}
@InProceedings{Weber:1989:FSF,
  author    = {D. G. Weber},
  title     = {Formal Specification of Fault-Tolerance and Its
               Relation to Computer Security},
  pages     = {273--277},
  ISBN      = {0-8186-1942-2},
  editor    = {Sol Greenspan},
  booktitle = {Proceedings of the 5th International Workshop on
               Software Specification and Design},
  address   = {Pittsburgh, PA},
  month     = may,
  year      = {1989},
  publisher = {IEEE Computer Society Press},
  annote    = {A neat and high level description of how fault-tolerance
    in its different forms can be specified at the system interface. A
    system is identified with its set of traces. A fault scenario is a
    precise description of how the components are doomed to fail (this
    is nowadays called the fault assumption). MTTF can be calculated by
    averaging over all fault scenarios. A system $D$ has a
    fault-tolerant version $FTD$, and let $N$ be a set of fault
    scenarios where no faults occur (fault-free fault assumption) and
    $C$ be a set of fault scenarios under which we desire fault
    tolerance. Proving fault tolerance can be now done in three ways:
    (1) show that the behavior of $D$ under $N$ is identical to the
    behavior of $FTD$ under $C$, (2) characterize the behavior of $D$
    under $N$ by some specification $S$ and show that $FTD$ under $C$
    implements $S$, or (3) show that the behavior of $FTD$ under $N$ is
    identical to the behavior of $FTD$ under $C$. The third method is
    taken as the basis for a definition of fault tolerance: A system is
    fault tolerant if for all its behaviors under $C$ there is an
    equivalent behavior under $N$. This definition can be weakened by
    redefining `equivalent' to mean `acceptably equivalent' regarding
    some equivalence relation on traces. This can also model
    graceful degradation (as is done in \cite{Herlihy:1991:SGD}). The
    author indicates that there are close resemblances to computer
    security specifications: highly sensitive events are analogous to
    faults as they should not show up on lower levels (i.e. to
    unauthorized users). Overall a short and concise paper, one of the
    earliest using this formal view, although I don't understand the
    differences between (1), (2) and (3) above. Referenced in
    \cite{Schepers:1993:TFT} as a similar approach as
    \cite{Joseph:1987:PRF} (expliziter Fehlermodellierung). Generally,
    security properties are probably higher order properties
    \cite{McLean:1994:GTC}. Not cited in \cite{Herlihy:1991:SGD}.}
}
@Article{Ben-Or:1990:FPS,
  author  = {Michael {Ben-Or} and Oded Goldreich and Silvio Micali
             and Ronald L. Rivest},
  title   = {A Fair Protocol for Signing Contracts},
  journal = {IEEE Transactions on Information Theory},
  volume  = {36},
  number  = {1},
  pages   = {40--46},
  year    = {1990},
  month   = jan,
  annote  = {The authors present a neat fair exchange protocol which
    works as follows: two parties $A$ and $B$ exchange in rounds signed
    statements of the form ``with probability $p$ the agreed-upon
    contract is valid for me'' ($p$ is different for messages signed by
    $A$ or by $B$). Both parties start with $p=0$ and independently
    decide how to increase their $p$. In the effective case, eventually
    both will receive a statement of the form ``with probability 1 the
    contract is valid for me''. In the non-effective case, one party
    (say $A$) can turn to a judge and present to it the last message it
    received from $B$. The judge throws a dice and decides with
    probability $p_B$ whether the contract holds for both or not. If it
    holds, $B$ must obey the contract too. If it does not hold, the
    contract is refuted. The judge must be able to recollect every
    verdict (and thus usually store the value together with the contract
    [a method is given how this can be circumvented]). Overall, this is
    a very interesting and well-written paper keeping mathematics
    small. The protocol can be seen as a very general and clever gradual
    exchange protocol which can also be applied if the to be exchanged
    item is not infinitely splittable. It is optimistic since the judge
    is only needed in failed cases. The paper is also interesting since
    it reviews some fairness definitions regarding gradual exchange
    (computational vs. probabilistic) and thus comes close to the
    formalization of strong fairness of \cite{Gaertner:1999:AFD}. Also,
    an impossibility result concerning two-party exchange is cited
    \cite{Even:1980:RAP} which is difficult to get, but relevant for
    \cite{Pagnia:1999:IFE}.}
}
@Article{Biran:1990:CCD,
  author  = {Ofer Biran and Shlomo Moran and Shmuel Zaks},
  title   = {A Combinatorial Characterization of the Distributed
             1-Solvable Tasks},
  journal = {Journal of Algorithms},
  year    = {1990},
  volume  = {11},
  number  = {3},
  month   = sep,
  pages   = {420--440},
  annote  = {[to read] extends \cite{Fischer:1985:IDC}.}
}
@Article{Buerk:1990:VES,
  author   = {Holger B{\"u}rk and Andreas Pfitzmann},
  title    = {Value Exchange Systems Enabling Security and
              Unobservability},
  journal  = {Computers \& Security},
  year     = {1990},
  volume   = {9},
  number   = {8},
  pages    = {715--721},
  keywords = {digital money, TTP, payment, pseudonyms, ware,
              complaint period},
  annote   = {[haven't got a copy, annote written by someone else:]
              two approaches to overcome the problem of simultaneity
              in value exchange. both based on digital signatures
              (pseudonyms/one-show credentials) certified by TTP: 1.)
              passive TTP: - mutual authentication using pseudonyms X
              <-> Y - signing of agreement X <-> Y - money X -> Y -
              ware Y -> X, or complaint X -> TTP, TTP checks
              agreement, asks Y to deliver again and passes ware to X
              or identifies Y (-> court) 2.) active (intermediary)
              TTP: - X,Y,TTP make agreement (to protect from TTP) -
              money X -> TTP (as money can not be copied this has to
              be done before ware to protect from faulty TTP) - ware
              Y -> TTP (after receiving ``money-commit'') - money ->
              Y, ware -> X (after check of ware) abortion after
              money-transfer requires signed cancelation by TTP
              and/or proof of payment. how handling
              interactive/non-transferable payments. question of
              quality of service have to be solved outside the system by
              a court. (good paper, not too formal)},
}
@Article{Champine:1990:PAD,
  author   = {George A. Champine and Daniel E. {Geer, Jr.} and
              William N. Ruh},
  title    = {{Project Athena} as a Distributed Computer System},
  journal  = j-IEEE-COMPUTER,
  year     = {1990},
  month    = sep,
  volume   = {23},
  number   = {9},
  pages    = {40--51},
  abstract = {Now providing 10,000 students and faculty with a
              variety of network services, MIT's educational
              workstation system is designed to grow to 10 times its
              present size.},
  annote   = {[to read]}
}
@InProceedings{Chaudhuri:1990:AHC,
  author    = {Soma Chaudhuri},
  title     = {Agreement is harder than consensus: set consensus
               problems in totally asynchronous systems},
  booktitle = {Proceedings of Principles of Distributed Computing 1990},
  pages     = {311--324},
  year      = {1990},
  annote    = {The author investigates the boundary between
               possibility and impossibility of solutions to
               problems in asynchronous systems. The problems
               investigated are $k$-set consensus problems, where
               the agreed upon set of values has size at most
               $k$. It is shown that the $m$-resiliency is in
               relation to the size $k$ of the set. This is another
               paper exploring the border to impossibility after
               the FLP result \cite{Fischer:1985:IDC} in the lines
               of \cite{Attiya:1987:ACA,Dolev:1987:MSN} and
               \cite{Dwork:1988:CPP}. A subsequent version appeared
               in Information and Computation, 105 (1), 1993,
               pp. 132--158.}
}
@Article{Dwork:1990:KCK,
  author  = {Cynthia Dwork and Yoram Moses},
  title   = {Knowledge and Common Knowledge in a {B}yzantine
             Environment: Crash Failures},
  journal = {Information and Computation},
  volume  = {88},
  number  = {2},
  pages   = {156--186},
  year    = {1990},
  topic   = {epistemic-logic;mutual-belief;Byzantine-agreement;},
  annote  = {[to read]}
}
@incollection{Emerson:1990:TML,
  author    = "E. Allen Emerson",
  title     = "Temporal and Modal Logic",
  booktitle = "Handbook of Theoretical Computer Science",
  editor    = "Jan van Leeuwen",
  volume    = "B",
  chapter   = "16",
  pages     = "997--1072",
  publisher = "Elsevier",
  year      = "1990",
  annote    = "Brilliant introduction into the zoo of temporal logics.",
}
@inproceedings{Gopal:1990:SFB,
  author    = {A. Gopal and S. Toueg},
  title     = {On the Specification of Fault-Tolerant Broadcast},
  booktitle = {Proc. Int. Workshop on Future Trends of Distributed
               Computing Systems},
  year      = {1990},
  pages     = {54--56},
  publisher = {IEEE Computer Society Press},
  address   = {Cairo, Egypt},
  annote    = {[to read]},
}
@article{Gouda:1990:SU,
  author  = {Mohamed G. Gouda and Ted Herman},
  title   = {Stabilizing unison},
  journal = j-IPL,
  year    = {1990},
  volume  = {35},
  pages   = {171--175},
  annote  = {A short paper in the lines of \cite{Arora:1991:MDC}.},
}
@inproceedings{Gronning:1990:SDD,
  author    = {Peter Gr{\o}nning and Thomas Qvist Nielsen and Hans
               Henrik L{\o}vengreen},
  title     = {Stepwise Development of a Distributed Load Balancing
               Algorithm},
  booktitle = pro-wdag90,
  editor    = {Jan van Leeuwen and Nicola Santoro},
  year      = {1990},
  series    = ser-LNCS,
  volume    = {486},
  publisher = pub-SV,
  isbn      = {3-540-54099-7},
  pages     = {151--168},
  annote    = {Abstract problem statement like in
               \cite{Arora:1995:ECC}. Formal definition of globally
               $k$-balanced, locally $k$-balanced. Resulting system is
               only locally balanced by simple local exchanges of
               one load unit at a time. No global balancing wanted,
               but a very broad sense of global balance achieved
               (depending on the diameter of the network). The
               abstract algorithm is transformed into a message passing
               environment. Explicit reference to
               self-stabilization of Dijkstra and hints to papers
               on stepwise development out of specifications.},
}
@inproceedings{Halpern:1990:CEB,
  author    = {Joseph Halpern and Yoram Moses and Orli Waarts},
  title     = {A Characterization of Eventual Byzantine Agreement},
  booktitle = {Proceedings of the 9th Annual {ACM} Symposium on
               Principles of Distributed Computing},
  editor    = {Cynthia Dwork},
  pages     = {333--346},
  publisher = {ACM Press},
  address   = {Qu{\'e}bec City, Qu{\'e}bec, Canada},
  month     = aug,
  year      = {1990},
  isbn      = {0-89791-404-X},
  annote    = {The authors show that while common knowledge is
               sufficient to achieve simultaneous Byzantine
               agreement, eventual Byzantine agreement (EBA) is
               equivalent to achieving continual common knowledge,
               a variant of common knowledge. They give a brief
               introduction into the knowledge formalism, define
               and characterize continual common knowledge and show
               how to construct optimal EBA protocols with a
               certain technique. The fault class under
               consideration comprises omission and crash
               faults. The conclusions state that results can be
               extended to Byzantine faults, asynchronous systems
               and general coordination problems. Overall a very
               concise and brief-up-to-the-bare-minimum paper.},
}
@article{Halpern:1990:KCK,
  author  = {Joseph Y. Halpern and Yoram Moses},
  title   = {Knowledge and common knowledge in a distributed
             environment},
  journal = j-ACM,
  year    = {1990},
  volume  = {37},
  number  = {3},
  pages   = {549--587},
  month   = jul,
  annote  = {A brilliant paper on the role of ``knowledge'' in
             distributed systems. The authors define different
             notions of knowledge (as opposed to belief) and
             emphasize the small differences by amusing and
             instructive examples. The different notions of
             knowledge are: distributed knowledge of x of a group
             G (someone who knows everything that people in G
             know knows x), ``someone'' knowledge, ``everyone''
             knowledge, and common knowledge. These form a strict
             hierarchy. Different forms of knowledge can be
             ascribed to processors, the most common being
             view-based knowledge. View-based knowledge of
             processor p of a fact f means that f is true in all
             points that are indistinguishable by p. Normally,
             view-based knowledge bases on the state (or state
             history) of a node. Common knowledge is the
             strongest form and the authors show that it is at
             the core to a lot of important problems in
             distributed systems (for example agreement). They use
             the coordinated attack problem (or two way
             handshake) to show that common knowledge is not
             attainable in systems with unreliable (or completely
             asynchronous) message passing and without a global
             clock. In general, such communication cannot be used
             to attain common knowledge. This is a direct
             connection to the impossibility of consensus in
             asynchronous systems \cite{Fischer:1985:IDC}. In
             practice, many problems can only be solved because
             they do not require common knowledge. But also: There
             are certain weaker kinds of common knowledge that
             are attainable. The first is epsilon-common
             knowledge, where the fact of sending a message (and
             that all others receive it) will become common
             knowledge within epsilon time steps. (This is
             analogous to a synchronous broadcast.) The second is
             eventual common knowledge, where sending a message
             will eventually become common knowledge. (This
             corresponds to asynchronous but reliable
             communication.) Eventual common knowledge is weaker
             than epsilon common knowledge. Things that can not be
             attained using reliable communication cannot be
             attained too if communication is
             unreliable. Finally, the notion of timestamped
             common knowledge is discussed (``at time t on p's
             clock p knows something''). Timestamped common
             knowledge is apparent in many protocols that operate
             in rounds. At the end, the notion of virtual
             synchrony is connected to the notion of knowledge
             consistency, where nodes may actually not have
             common knowledge, but nothing they see violates this
             assumption. The conclusions contain hints to other
             research in the field. Overall, this is a paper with
             a huge potential that seemingly hasn't been followed
             in recent years.},
}
@article{Jalote:1990:FRW,
  author  = {P. Jalote and S. K. Tripathi},
  title   = {Final Report on Workshop on Integrated Approach for
             Fault Tolerance - Current State and Future
             Requirements},
  journal = {ACM Operating Systems Review},
  year    = {1990},
  volume  = {24},
  number  = {1},
  pages   = {40--57},
  annote  = {[to read]},
}
@inproceedings{Jayanti:1990:WUR,
  author    = {Prasad Jayanti and Sam Toueg},
  title     = {Wakeup under Read/Write Atomicity},
  booktitle = {Distributed Algorithms, 4th International Workshop},
  editor    = {Jan van Leeuwen and Nicola Santoro},
  address   = {Bari, Italy},
  month     = "24--26~" # sep,
  year      = {1990},
  series    = ser-LNCS,
  volume    = {486},
  publisher = pub-SV,
  isbn      = {3-540-54099-7},
  pages     = {277--288},
  annote    = {Ted says that here's a possible relation between
               self-stabilization and unreliable failure detection. Have to get it.},
}
@book{Krumm:1990:FAK,
  author    = "Heiko Krumm",
  title     = "{Funktionale Analyse von Kommunikationsprotokollen}",
  publisher = pub-SV,
  year      = "1990",
  series    = "Informatik-Fachberichte",
  number    = "247",
  annote    = "Krumm entwirft ein allgemeines Modell zur Beschreibung
    funktionaler Aspekte von Kommunikationsprotokollen kooperierender
    Systeme und gibt einen verdienstvollen Ueberblick ueber die
    existierenden Spezifikationstechniken und deren Zusammenhaenge
    untereinander. Die Grundbegriffe des Modells sind System, Kopplung
    und Instanz. Ein System besteht aus einer Menge von Instanzen, die
    intern ueber eine Kopplung kommunizieren. Das System selbst hat im
    Falle eines offenen Systems eine Schnittstelle zu einer
    Systemumwelt, und da Instanzen selbst wieder Systeme im Kleinen
    sind, haben Instanzen auch eine Instanz-Schnittstelle. Diese
    statische Grundstruktur erlaubt eine hierarchische
    Systemdefinition. Systeme bzw. Instanzen koennen intern betrachtet
    werden (d.h. ihr innerer Aufbau inklusive Subsystemen und
    Kopplung) oder extern (d.h. nur anhand ihres Verhaltens an der
    Schnittstelle). Eine Schnittstelle ist eine Menge von
    Ereignissen. Das Kommunikationsverhalten an einer Schnittstelle
    ist ein Baum, dessen Kanten mit derartigen Ereignissen bezeichnet
    ist. Kommunikationsverhalten abstrahiert von internem Verhalten
    eines Systems. Durch Betrachtung des Kommunikationsverhaltens ist
    es moeglich, Instanzen bezueglich ihres Verhaltens zu
    vergleichen. Kommunikation wird modelliert durch eine Kopplung,
    die selbst ein System ohne interne Zustaende ist und atomar
    beliebig viele Ereignisse (=Nachrichten) an der Schnittstelle
    empfangen und Versenden kann. Kopplungen arbeiten nach dem
    Uebereinkunftsprinzip (synchron) oder nach dem
    Uebertragungsprinzip (asynchron). Systeme selbst werden auf der
    Basis von Zustaenden und Zustandsuebergaengen definiert. Aus
    dieser Definition entspricht der Menge aller Systemablaeufe ein
    Erreichbarkeitsgraph, der eine endliche Repraesentation eines
    prinzipiell unendlichen Verhaltens ist. Das
    Schnittstellenverhalten (Kommunikationsverhalten) ist ein Baum,
    der mit dem internen Systemablauf vertraeglich ist. Anschliessend
    wird auf die Begriffe Dienst und Protokoll eingegangen. Ein
    Protokoll ist ein internes Ablaufverhalten eines Systems, welches
    vom globalen Verhalten abstrahiert und nur die vom Protokoll
    reglementierten Kommunikationsbeziehungen betrachtet. Ein Dienst
    ist eine Instanz, die an ihrer Schnittstelle ein gewisses
    Verhalten (mit gewissen Ereignissen)
    garantiert/anbietet. Protokolle sind darum horizontale
    Kommunikationsbeziehungen waehrend Dienste vertikale Beziehungen
    darstellen (bezogen auf die gebraeuchliche Darstellung des
    ISO/OSI-Protokollstacks). Im folgenden Kapitel werden die
    gaengigen Analysemassnahmen angesprochen (von informalen
    Ueberpruefungen bis formalen Korrektheitsnachweisen) und der
    Begriff der Eigenschaft eines Protokolls definiert. Anschliessend
    werden gaengige Spezifikationstechniken klassifiziert nach
    Spezifikationsform (konstruktiv = spezifiziere
    Schnittstellenverhalten durch internes Verhalten, deskriptiv =
    spezifiziere Schnittstellenverhalten durch direkte
    Verhaltensbeschreibung an der Schnittstelle). Konstruktive
    Techniken koennen direkt (konkreter Automat angegeben) oder
    algebraisch (eine gewisse Abstraktion von internem
    Automatenverhalten) sein. Deskriptive Techniken koennen logisch
    (aufbauend auf einem (temporal-)logischen Kalkuel) oder auf
    Zusicherungen aufbauen. Letztere koennen allerdings nur
    Sicherheitseigenschaften verifizieren. Beispiele fuer die
    einzelnen Spezifikationsformen werden gegeben (Petri Netze,
    Milners CCS, erweiterte endliche Automaten). Das Buch ist
    insgesamt sehr gut lesbar und auch fuer Einsteiger in das Gebiet
    durchaus geeignet, vor allem, weil es auf Deutsch ist.",
}
@incollection{Lamport:1990:DCM,
  author    = {Leslie Lamport and Nancy Lynch},
  title     = {Distributed computing: models and methods},
  booktitle = {Handbook of Theoretical Computer Science
               (Volume B: Formal Models and Semantics)},
  editor    = {Jan van Leeuwen},
  chapter   = {18},
  pages     = {1157--1199},
  publisher = {Elsevier},
  year      = {1990},
  annote    = {},
}
@book{Lee:1990:FTP,
  author    = {Peter A. Lee and Thomas Anderson},
  title     = {Fault Tolerance: Principles and Practice},
  publisher = pub-SV,
  address   = {Berlin ; New York},
  year      = {1990},
  edition   = {Second},
  series    = {Dependable computing and fault-tolerant systems},
  annote    = {[to read]},
}
@article{Leveson:1990:USC,
  author   = {Nancy G. Leveson and Stephen S. Cha and John C. Knight
              and Timothy J. Shimeall},
  title    = {The Use of Self Checks and Voting in Software Error
              Detection: An Empirical Study},
  journal  = j-IEEE-TRANS-SOFTW-ENG,
  year     = {1990},
  volume   = {16},
  number   = {4},
  pages    = {432--443},
  abstract = {This paper presents the results of an empirical study
              of software error detection using self checks and
              N-version voting. A total of 24 graduate students in
              computer science at the University of Virginia and the
              University of California, Irvine, were hired as
              programmers. Working independently, each first prepared
              a set of self checks to an existing implementation of
              that specification. The modified programs were executed
              to measure the error-detection performance of the
              checks and to compare this with error detection using
              simple voting among multiple versions. The goal of this
              study was to learn more about the effectiveness of such
              checks. The analysis of the checks revealed that there
              are great differences in the ability of individual
              programmers to design effective checks. We found that
              some checks which might have been effective failed to
              detect an error because they were badly placed, and
              there were numerous instances of checks signaling
              nonexistent errors. In general, specification-based
              checks alone were not as effective as combining them
              with code-based checks. Using self checks, faults were
              identified that had not been detected previously by
              voting 28 versions of the program over a million
              randomly-generated inputs. This appeared to result from
              the fact that the self checks could examine the
              internal state of the executing program whereas voting
              examines only the final results of computations. If
              internal states had to be consistent in N-version
              voting systems, then there would be no reason to write
              multiple versions. The programs were executed on 100
              000 new randomly-generated input cases in order to
              compare error detection by self checks and by 2-version
              and 3-version voting. Both self checks and voting
              techniques led to the identification of the same number
              of faults for this input, although the identified
              faults were not the same. Furthermore, whereas the self
              checks were always effective at detecting an error
              caused by a particular fault (if they ever did),
              N-version voting triples and pairs were only partially
              effective at detecting the failures caused by
              particular faults. Finally, checking the internal state
              with self checks also resulted in finding faults that
              did not cause failures for the particular input case
              executed. This has important implications for the use
              of back-to-back testing.},
  note     = {29 refs},
}
@article{Mullender:1990:ADO,
  author   = {Sape J. Mullender and Guido {van Rossum} and Andrew S.
              Tanenbaum and Robbert {van Renesse} and Hans {van
              Staveren}},
  title    = {{Amoeba}: {A} Distributed Operating System for the
              1990s},
  journal  = j-IEEE-COMPUTER,
  year     = {1990},
  volume   = {23},
  number   = {5},
  pages    = {44--53},
  month    = may,
  abstract = {Amoeba is the distributed system developed at the Free
              University (VU) and the Centre for Mathematics and
              Computer Science (CWI), both in Amsterdam. Throughout
              the project's ten-year history, a major concern of the
              designers has been to combine the research themes of
              distributed systems, such as high availability, use of
              parallelism and scalability, with simplicity and high
              performance. Distributed systems are necessarily more
              complicated than centralized systems, so they have a
              tendency to be much slower. Amoeba was always designed
              to be used, so it was deemed essential to achieve
              extremely high performance. The Amoeba software is
              based on objects. An object is a piece of data on
              which well-defined operations may be performed by
              authorized users, independent of where the user and
              object are located. Objects are managed by server
              processes and named using capabilities chosen randomly
              from a sparse name space. Processes consist of a
              segmented address space shared by one or more threads
              of control. Processes can be created, managed, and
              debugged remotely. Operations on objects are
              implemented using remote procedure calls. Amoeba has a
              unique and fast file system. The file system is split
              into two parts --- the Bullet Service, which stores
              immutable files contiguously on the disk and the SOAP
              Directory Service, which provides a mechanism for
              giving capabilities symbolic names. The directory
              server also handles replication and atomicity,
              eliminating the need for a separate transaction
              management system.},
  annote   = {[to read]},
}
@article{Neiger:1990:AIF,
  author  = {Gil Neiger and Sam Toueg},
  title   = {Automatically Increasing the Fault-Tolerance of
             Distributed Algorithms},
  journal = {Journal of Algorithms},
  year    = {1990},
  volume  = {11},
  number  = {3},
  pages   = {374--419},
  annote  = {Say you have designed a distributed algorithm in a
    synchronous (round based) system that tolerates crash failures using
    reliable communication. Can you mechanically derive a protocol which
    does the same thing and also tolerates send-omission, general
    omission or arbitrary failures? Yes you can, and Neiger and Toueg
    show you how to do it. The authors define a so-called translation,
    i.e. a function $T$ that converts a protocol $P_b$ to a protocol
    $P_s$. $P_b$ is correct when running in a system subject to a
    ``benign'' failure model $b$, and $P_s=T(P_b)$ is supposed to be
    correct in a system subject to a more severe failure model
    $s$. Correctness means that $P_s$ has the same set of histories as
    $P_b$ when you inspect only that part of the state which also exists
    in $P_b$. Also, only the states after a fixed number $c$ are
    inspected (i.e. they speak of a $c$-phase simulation). Formally, a
    translation from a system $S_b$ to a system $S_s$ is given by a
    history simulation function $H$ with the following properties: (a)
    $H$ maps histories of a protocol running in $S_s$ to histories in
    $S_b$ and these histories are valid. (b) $H$ preserves the
    correctness of processors, i.e. a processor correct in $S_s$ is also
    correct in $S_b$ (not necessarily vice versa), (c) the states from a
    history in $S_b$ appear in steps of $c$ in the history of
    $S_s$. This refers to the states of all processors (this must be
    weakened when investigating translations to the byzantine failure
    model; there they only refer to the state of the correct
    processors). A translated protocol solves some problem if its
    translated histories solve the original problem specification. The
    authors continue to present translations from crash to send-omission
    and then from crash to general-omission. The idea is to insert
    additional rounds of communication and let processors which do a
    general omission crash themselves. Because the number of faulty
    processors in both systems is $t$, such a translation is
    possible. When dealing with arbitrary failures, the properties of
    the translation function are weakened (see above). Translations are
    presented which use a validated reliable broadcast primitive to be
    able to detect byzantine behavior and pretend that the bad processes
    crashed. Some lower bounds are proved as well. Overall a
    well-readable paper despite the formalisms and the proofs. It is
    interesting how the original correctness specifications are
    transformed into systems with a more severe failure model: with
    crash the specification stays the same (since we are in a
    synchronous environment this is possible \cite{Gaertner:1999:ESD}),
    with Byzantine we restrict the correctness to the set of correct
    processes. Are there intermediate steps?},
}
@article{Nelson:1990:FTC,
  author  = {Victor P. Nelson},
  title   = {Fault-tolerant computing: fundamental concepts},
  journal = j-IEEE-COMPUTER,
  year    = {1990},
  volume  = {23},
  number  = {7},
  pages   = {19--25},
  month   = jul,
  annote  = {The author first defines the usual terms (fault,
             error, failure, fault classes, availability,
             dependability, reliability) and then reviews common
             elements of strategies in fault tolerance with focus on
             hardware. The elements are fault masking, fault
             detection, fault containment, diagnosis,
             repair/reconfiguration, recovery. He elaborates on
             error detection/masking/correction (using codes),
             self-checking logic, module replication, timing
             checks, fault containment, reconfiguration, repair
             and recovery. He only handles masking fault tolerance
             (indicating that safety is more important than liveness
             \cite{Kreitz:1998:SWL}). An insightful paper where
             the ideas come from the obvious strive to organize
             the material more strictly. This is a task Nelson
             initiates, but has seemingly not aimed at.},
}
@article{Ramanathan:1990:FCS,
  author   = {Parameswaran Ramanathan and Kang G. Shin and Ricky W.
              Butler},
  title    = {Fault-Tolerant Clock Synchronization in Distributed
              Systems},
  journal  = j-IEEE-COMPUTER,
  year     = {1990},
  volume   = {23},
  number   = {10},
  pages    = {33--42},
  month    = oct,
  abstract = {Software algorithms are suitable only where loose
              synchronization is acceptable, and hardware algorithms
              are expensive. A hybrid scheme achieves reasonably
              tight synchronization and is cost-effective.},
  keywords = {Computer Software--Applications; Computer Systems,
              Digital; Computers, Digital--Synchronization;
              Consistency algorithms; Convergence-averaging;
              Convergence-nonaveraging; Distributed; Distributed
              systems; Fault tolerant clock synchronisation;
              Fault-Tolerant Systems; Hardware synchronization
              algorithms; Hybrid synchronization; Probabilistic
              synchronization; Software synchronization algorithms;
              Synchronization Algorithms; Worst-case clock skews},
  annote   = {[to read]},
}
@book{Raynal:1990:SCD,
  author    = {Michel Raynal and Jean-Michel Helary},
  title     = {Synchronization and Control of Distributed Systems and
               Programs},
  series    = {Wiley Series in Parallel Computing},
  publisher = {John Wiley \& Sons},
  address   = {New York},
  year      = {1990},
  pages     = {124},
  isbn      = {0-471-92453-9},
  keywords  = {book, text,},
  abstract  = {** Description ** The mastery of distributed
               applications demands a complete understanding of the
               foundations of the distributed algorithm. The object of
               this book is to present these foundations as they
               relate to synchronization--the key element of
               parallelism and distribution. Divided into four
               chapters, it explores the different types of
               synchronization that may be encountered in a parallel
               application and presents the concept of wave and
               several of its possible implementations. Synchronous
               and asynchronous systems and their relationships are
               described, as well as the concept of the
               synchronization phase, its properties, and its use.\par
               ** Contents ** Different Forms of Synchronization
               between Processes. The Concept of a Wave, and
               Synchronization by Wave Sequence. Synchronization by
               Logic Pulsing. Synchronization by Phases. Appendices.
               References. Index.\par ** Market ** Engineers,
               Researchers, Professors and Students of Engineering.},
  note      = {F-0-471-92453-9 1990cloth \$84.95},
}
@article{Schneider:1990:IFS,
  author  = {Fred B. Schneider},
  title   = {Implementing fault-tolerant services using the state
             machine approach: {A} tutorial},
  journal = j-ACM-COMP-SURVEYS,
  year    = {1990},
  volume  = {22},
  number  = {4},
  pages   = {299--319},
  month   = dec,
  annote  = {The state machine approach enhances the
             fault-tolerance properties of a system by
             replicating nodes and coordinating the actions of
             these nodes (and the communication to and from them).
             The replica group thus acts as a single state
             machine, but now a certain number and kind of faults
             can be tolerated. This paper presents this approach
             and also discusses reconfiguration techniques. This
             is the paper to cite for the term ``state machine
             approach''.},
}
@article{VanGasteren:1990:ANI,
  author  = {A. J. M. van Gasteren and Gerard Tel},
  title   = {Comments on ``on the proof of a distributed
             algorithm'': always true is not invariant},
  journal = j-IPL,
  year    = {1990},
  volume  = {35},
  pages   = {277--279},
  month   = sep,
  annote  = {a paper which explains the intrinsic difference
             between the notions of ``invariant'' and
             ``always-true''. A predicate $P$ is an invariant (1)
             if $P$ holds in every initial state of a system, and
             (2) $P$ is not falsified by any action of the
             system. A predicate $P$ is always true if $P$ holds
             in every reachable state of the system. This means, an
             invariant is always true, but the converse is
             not valid. Example: a program which has one
             variable $k$ (initially 0) and one action ``if $k=1$
             then $k:=2$''. Consider the predicate $P\equiv
             k<2$. Then $P$ is always true for the program, but
             $P$ is not an invariant, because the action does not
             maintain $P$. The authors argue that invariance is
             more useful because it is maintained by program
             composition.},
}
@article{Abadi:1991:ERM,
  author  = "Mart{\'\i}n Abadi and Leslie Lamport",
  title   = "The Existence of Refinement Mappings",
  journal = "Theoretical Computer Science",
  year    = "1991",
  volume  = "82",
  number  = "2",
  pages   = "253--284",
  month   = may,
  url     = "http://www.research.digital.com/SRC/personal/Martin_Abadi/Papers/tcs.ps",
  annote  = "Programs and specifications are viewed as formulas of the
    same logic (originally an idea of \cite{Pnueli:1981:TSC} explained
    in \cite{Abadi:1995:CS,Abadi:1993:CS,Lamport:1989:SAS}). The
    semantics of such a formula is the set of executions $\phi$ produced
    by that formula. A program $p_1$ implements $p_2$ if $\phi(p_1)$
    implies $\phi(p_2)$. That $p_1$ implements $p_2$ can be shown by
    exhibiting a refinement mapping which relates the actions of $p_1$
    to those of $p_2$. However, the validity of the implication does not
    guarantee that such a mapping exists. A refinement mapping between
    state spaces $S_1$ and $S_2$ can be used to prove that a state
    machine $\Sigma_1$ using states from $S_1$ implements a state
    machine $\Sigma_2$ using states from $S_2$. The main result of this
    paper is the following theorem: If $\Sigma_1$ implements $\Sigma_2$
    then one can add history and prophecy variables to the specification
    of $\Sigma_1$ to find a refinement mapping from $S_1$ to $S_2$. The
    assumptions to prove this theorem are: (1) $S_1$ is machine closed,
    i.e. the ``liveness'' property of $\Sigma_1$ does not imply
    additional safety properties. (2) $\Sigma_2$ has finite invisible
    nondeterminism, i.e. external steps of $\Sigma_2$ must be finitely
    representable internally, and (3) $\Sigma_2$ is internally
    continuous, i.e. a not-allowed behavior can be determined by looking
    at the externally visible behavior plus only a finite part of the
    internal behavior. Other proved propositions are: any safety
    property has a specification with finite invisible nondeterminism,
    any safety property is internally continuous, and any property has a
    machine closed specification. The result shows that it is always
    possible to prove safety using refinement mappings, if not
    liveness.",
}
@article{Arora:1991:MDC,
  author   = {A. Arora and S. Dolev and M. Gouda},
  title    = {Maintaining digital clocks in step},
  journal  = {Parallel Processing Letters},
  year     = {1991},
  volume   = {1},
  number   = {1},
  pages    = {11--18},
  month    = sep,
  keywords = {clocks; N-clock; simultaneously triggered clocks;
              stabilisation; stability; system},
  annote   = {The authors present a design for achieving exact
              synchronization of bounded digital clocks in
              synchronous (i.e., lock-step) systems like digital
              circuits. The approach is an early example of
              applying the closure and convergence paradigm to
              problems, resulting in two self-stabilizing
              solutions: (1) a fall back solution, where a node
              periodically checks the clocks of its neighbours and
              falls back to a minimum value if values differ; and
              (2) a catch up solution where a maximum value is
              taken. The protocols are simple, uniform and
              distributed. The stabilization time is in the order
              of the degree of the nodes times the diameter of the
              network. Overall, this is a paper unmistaken in the
              clarity and enjoyment of exposition and style,
              gladly to be read.},
}
@inproceedings{Awerbuch:1991:SLC,
  author    = {Baruch Awerbuch and Boaz Patt-Shamir and George Varghese},
  title     = {Self-stabilization by local checking and
               correction},
  booktitle = {FOCS91 Proceedings of the 32nd Annual IEEE
               Symposium on Foundations of Computer Science},
  year      = {1991},
  pages     = {268--277},
  annote    = {[to write]},
}
@inproceedings{Chandra:1991:UFD,
  author    = {Tushar Deepak Chandra and Sam Toueg},
  title     = {Unreliable failure detectors for asynchronous systems},
  booktitle = pro-podc91,
  year      = {1991},
  pages     = {325--340},
  annote    = {The authors introduce the concept of a failure
               detector to battle the impossibility of consensus in
               asynchronous systems. Failure detectors allow to make
               guesses on which computers are still alive and which
               are not in the network. They are classified and
               applied to consensus and atomic broadcast. See also
               the journal version of this paper
               \cite{Chandra:1996:UFD}. First reference to the concept.},
}
@article{Chen:1991:SAC,
  author  = {Nian-Shing Chen and Hwey-Pyng Yu and Shing-Tsaan Huang},
  title   = {A self-stabilizing algorithm for constructing
             spanning trees},
  journal = j-IPL,
  year    = {1991},
  volume  = {39},
  pages   = {147--151},
}
@article{Cooper:1991:CDG,
  author  = {Robert Cooper and Keith Marzullo},
  title   = {Consistent detection of global predicates},
  journal = j-SIGPLAN,
  year    = {1991},
  volume  = {26},
  number  = {12},
  pages   = {167--174},
  month   = dec,
  annote  = {Citable definition of possibly(P) and definitely(P).},
}
@article{Cristian:1991:UFD,
  author  = "Flaviu Cristian",
  title   = "Understanding fault-tolerant distributed systems",
  journal = j-CACM,
  year    = "1991",
  volume  = "34",
  number  = "2",
  pages   = "56--78",
  month   = feb,
  annote  = "Describes traditional approach to fault-tolerant
             computing: failure models, failure semantics,
             fault-tolerance, architectural issues, standard
             systems, masking failures, hardware and software
             fault-tolerance.",
}
@book{Echtle:1990:F,
  author    = "Klaus Echtle",
  title     = "Fehlertoleranzverfahren",
  publisher = pub-SV,
  year      = "1990",
  annote    = {Echtle's well-known book on fault tolerance strategies.
               Ist in der Bib. Inf. vorhanden},
}
@inproceedings{Arora:1991:MDS,
  author    = {Anish Arora and Shlomi Dolev and Mohamed G. Gouda},
  title     = {Maintaining Digital Clocks In Step},
  booktitle = {Distributed Algorithms, 5th International Workshop},
  editor    = {Sam Toueg and Paul G. Spirakis and Lefteris M.
               Kirousis},
  address   = {Delphi, Greece},
  month     = "7--9~" # oct,
  year      = {1991},
  series    = ser-LNCS,
  volume    = {579},
  publisher = pub-SV,
  isbn      = {3-540-55236-7},
  pages     = {71--79},
  annote    = {[to get]},
}
%%% NOTE(review): the citation tag below says 1991, but the year field and
%%% the booktitle (ICYCS-93) say 1993. The tag is left unchanged so that
%%% existing \cite commands keep working -- confirm before relabelling.
@inproceedings{Flatebo:1991:SLB,
  author    = {Mitchell Flatebo and Ajoy Kumar Datta},
  title     = {Self-stabilizing load balancing for an arbitrary network},
  booktitle = {ICYCS-93: Young Computer Scientists. Proceedings of
               the Third International Conference},
  editor    = {J. Wu and W. Gao and J. Yang and Y. Li},
  pages     = {743--746},
  publisher = {Tsinghua University Press},
  address   = {Beijing, China},
  month     = jul,
  year      = {1993},
  annote    = {[who can get a hand on this?]},
}
@article{Gouda:1991:AP,
  author  = {Mohamed G. Gouda and Ted Herman},
  title   = {Adaptive programming},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year    = {1991},
  volume  = {17},
  number  = {9},
  pages   = {911--921},
  month   = sep,
  annote  = {Adaptive programs change their behaviour according
             to changes in their environment. Environment changes
             are assumed to be gradual and occur within
             relatively short periods of time compared to long
             periods of non-change. During periods of change an
             adaptive program behaves arbitrarily and eventually
             reaches a consistent behaviour if changes cease. The
             authors define adaptivity in terms of a `secures'
             relation: P secures Q in S means that if the
             environment establishes an input predicate P, then
             the program S will eventually reach a state where Q
             holds. A program is adaptive, if all properties of
             interest can be expressed using the secures
             relation. Thus, adaptivity is a general form of
             self-stabilization (which is ``true secures Q in
             S''). But in self-stabilization, legal states are
             usually defined in terms of internal
             variables. In adaptive programs there can be changes
             of the definition of legal states imposed by the
             environment.},
}
@article{Gouda:1991:SCP,
  author  = {Mohamed G. Gouda and Nicholas J. Multari},
  title   = {Stabilizing communication protocols},
  journal = j-IEEE-TRANS-COMP,
  volume  = {40},
  number  = {4},
  pages   = {448--458},
  month   = apr,
  year    = {1991},
  annote  = {convergence stair presented}
}
@Article{Herlihy:1991:SGD,
  author  = {Maurice P. Herlihy and Jeannette M. Wing},
  title   = {Specifying graceful degradation},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year    = {1991},
  volume  = {2},
  number  = {1},
  pages   = {93--104},
  month   = jan,
  annote  = {The authors show how the ideal specification of a program
             can be ``degraded'' in a structured way so that the
             behavior of the program is still ``close'' to the ideal
             specification if the environment (faults etc.) prohibits
             the ideal specification to be satisfied. Processes and
             the environment are modeled as finite state machines.
             State transitions of the processes are called operations,
             transitions of the environment are called events. The
             combined automaton produces executions (sequences of
             state/operation pairs). The ideal specification prescribes
             a certain set of executions assuming a certain state of
             the environment. The environment ensures some properties
             called constraints. Events cause these constraints to be
             violated, resulting in an `enlarged' behavior of the
             combined automaton. Depending on the set of constraints
             guaranteed by the environment, the combined automaton
             satisfies a weaker specification than the ideal
             specification. The constraints induce a lattice on
             the set of specifications of the automaton. This allows
             a designer to specify system behavior in the presence
             of violated constraints. (Cases where this can arise
             in practice are faults, timing violations or security
             breaches.) The method (called the lattice relaxation method)
             makes environmental assumptions explicit and enables
             you to specify unwanted but sometimes unavoidable
             cases of system performance. Let's see how specifications
             can be systematically parametrized to yield such a
             lattice. [This paragraph was written in a state of
             partial sleep deliria; do not infer the quality of
             the paper from this text. In fact, the paper is
             very deep and interesting.] A similar idea is mentioned
             in \cite{Schepers:1993:TFT}. Does not cite
             \cite{Weber:1989:FSF}.}
}
@phdthesis{Herman:1991:ATD,
  author    = "Ted Herman",
  title     = "Adaptivity through distributed convergence",
  school    = "Department of Computer Science, University of Texas
               at Austin",
  year      = "1991",
  OPTannote = "nicht kopiert/ausgedruckt"
}
@InProceedings{Liskov:1991:PUS,
  author    = "Barbara Liskov",
  title     = "Practical Uses of Synchronized Clocks in Distributed
               Systems",
  pages     = "1--10",
  ISBN      = "0-89791-439-2",
  editor    = "Luigi Logrippo",
  booktitle = pro-podc91,
  address   = "Montr{\'e}al, Qu{\'e}bec, Canada",
  month     = aug,
  year      = "1991",
  publisher = pub-ACM,
  annote    = "Discusses several uses of synchronized clocks in distributed
               algorithms: SCMP protocol (a protocol that achieves at-most-once
               semantics of messages), tickets in Kerberos, and several forms of
               leases \cite{Gray:1989:LEF} for maintaining replica consistency. The
               starting point to remember is that assumptions about clock rates are
               always probabilistic and so assumptions about synchronization should
               only affect the performance not the correctness of a protocol. In
               general, time is used to achieve liveness, e.g. a server requests
               a replica to give up its lease; it waits either until the replica
               replies or its lease expires. The final paragraph contains some
               ideas on how to transform an algorithm not relying on synchronized
               clocks into more efficient versions using synchronized clocks: (1)
               identify messages which could be avoided using timestamps, (2)
               if message exchange is already reduced, find ways to save storage
               using timestamps (e.g. purge storage after $t$ seconds)."
}
@phdthesis{Liu:1991:FTP,
  author = "Zhiming Liu",
  title  = "Fault-tolerant programming by transformations",
  school = "University of Warwick, Department of Computer Science",
  year   = "1991",
  annote = "Published and extended in many forms
            \cite{Liu:1992:TPF,Liu:1993:SVR,Liu:1994:SDF,Liu:1995:FFF} and
            \cite{Liu:1996:VFR,Liu:1998:SVF} but seemingly the only reference to
            the term ``finite error behavior'' (p. 27)."
}
@inproceedings{Long:1991:SRI,
  author    = "Darrel D. E. Long and John L. Carroll and C. J. Park",
  title     = "A study of the reliability of {Internet} sites",
  booktitle = pro-srds91,
  pages     = "177--186",
  month     = sep,
  year      = "1991",
  annote    = "to read"
}
@book{Manna:1991:TLR,
  author    = "Zohar Manna and Amir Pnueli",
  title     = "The temporal logic of reactive and concurrent systems:
               Specification",
  publisher = pub-SV,
  year      = "1991",
  annote    = "See also \cite{Manna:1995:TVR}."
}
@inproceedings{Marzullo:1991:DGS,
  author    = {Keith Marzullo and Gil Neiger},
  title     = {Detection of global state predicates},
  booktitle = pro-wdag91,
  pages     = {254--272},
  year      = {1991}
}
@techreport{Marzullo:1991:TCD,
  author      = {Keith Marzullo and Mark D. Wood},
  title       = {Tools for Constructing Distributed Reactive Systems},
  institution = {Dept.\ of Computer Science, Cornell University},
  number      = {TR 91-1193},
  address     = {Ithaca, New York ({USA})},
  month       = feb,
  year        = {1991},
  annote      = {mentions sensor/actuator approach [to read]}
}
@article{Oezveren:1991:SSD,
  author        = {C{\"u}neyt M. {\"O}zveren and Alan S. Willsky and
                   Panos J. Antsaklis},
  title         = {Stability and Stabilizability of Discrete Event
                   Dynamic Systems},
  journal       = j-ACM,
  volume        = {38},
  number        = {3},
  pages         = {730--752},
  month         = jul,
  year          = {1991},
  area          = {Theory of Computation},
  general-terms = {Algorithms, Design, Languages, Reliability, Theory},
  keywords      = {Reliability, self-stabilizing systems, stability,
                   stabilizability, state feedback},
  cr-categories = {F.2.2 [computations on discrete structures \and
                   sequencing and scheduling]; F.4.3 [algebraic language
                   theory \and classes defined by grammars or automata];
                   G.2.2 [graph algorithms]; G.4 [algorithm analysis \and
                   reliability and robustness]; H.2.8; J.7 [command and
                   control \and process control]},
  annote        = {[to read]}
}
@Article{Peleska:1991:DVF,
  author    = "Jan Peleska",
  title     = "Design and Verification of Fault Tolerant Systems with
               {CSP}",
  pages     = "95--106",
  journal   = j-DC,
  volume    = "5",
  number    = "2",
  year      = "1991",
  publisher = pub-SV,
  annote    = "A case study in proving a hot standby system correct using
               CSP. The proof method is like in CSP and proves refinements down
               several levels to the implementation. At some lower level, crash
               faults are introduced and masked by a redundant component together
               with a reconfiguration procedure. It seems as if faults and fault
               actions are modeled explicitly. Conversely to \cite{Peled:1994:CFF},
               refinement steps are constructed by hand instead of using
               correctness preserving transformations (this is advocated as
               ``invent and verify'' which is claimed to suit industry). You need
               to know good CSP to really understand the text. If only parts of the
               system properties may be proved, this is noted to be something like
               graceful degradation."
}
@article{Ralston:1991:FMH,
  author  = "T. J. Ralston and S. L. Gerhart",
  title   = "Formal methods: {History}, practice, trends and prognosis",
  journal = "American Programmer",
  month   = may,
  year    = "1991",
  annote  = "[to get], cited in \cite{Glass:1999:RST} as the only study
             which has produced hard numbers on the benefit of
             applying formal methods in software engineering."
}
@InProceedings{Sanders:1991:PTA,
  author    = "Beverly Sanders",
  title     = "A Predicate Transformer Approach to Knowledge and
               Knowledge-based Protocols",
  pages     = "217--22",
  ISBN      = "0-89791-439-2",
  editor    = "Luigi Logrippo",
  booktitle = pro-podc91,
  address   = "Montr{\'e}al, Qu{\'e}bec, Canada",
  month     = aug,
  year      = "1991",
  publisher = "ACM Press",
  annote    = "[to read]"
}
@Article{Swade:1991:CCB,
  author   = "D. Swade",
  title    = "The construction of {Charles Babbage's} difference
              engine",
  journal  = "Annals of the History of Computing",
  volume   = "13",
  number   = "1",
  pages    = "82--83",
  year     = "1991",
  keywords = "Babbage, difference engine",
  abstract = "Science Museum UK is building Babbage's difference
              engine (not his analytic engine which \~{} computer) to
              celebrate 200-th anniversary of Babbage's death (1771).
              4000 parts, 3 tons, 10x6x1.5 feet Being built in
              materials and with accuracy of Babbage's day. The D.E.
              calculates 7th order polynomials to 30 decimal
              places.",
  annote   = "Bowen \cite{Bowen:1993:SCS} cites another text by
              Swade towards the concerns of Charles Babbage about
              the `table crisis' which led to the development
              of the difference engine."
}
@Book{Tel:1991:TDA,
  author    = {Gerard Tel},
  title     = {Topics in Distributed Algorithms},
  publisher = {Cambridge University Press},
  year      = {1991},
  number    = {1},
  series    = {Cambridge International Series on Parallel Computation}
}
@phdthesis{Arora:1992:FFC,
  author = "Anish Kumar Arora",
  title  = "A foundation of fault-tolerant computing",
  school = "The University of Texas at Austin",
  month  = dec,
  year   = "1992",
  annote = {Arora's thesis defines fault tolerance as the result
            of two conditions: closure and convergence. Closure
            means that a system remains in a set of legal states
            during normal system behaviour, convergence assures
            that any fault (modelled as actions on an extended
            state space \cite{Cristian:1985:RAF}) is eventually
            tolerated by returning into the set of legal states
            in finite time. This is a stabilizing notion of
            fault tolerance, published in an IEEE conference
            proceedings \cite{Arora:1993:CCF} and subsequently
            enhanced into a theory of correctors and detectors,
            a general theory of fault tolerance
            \cite{Arora:1998:CDM}.}
}
@article{Beauquier:1992:TDP,
  author  = {Joffroy Beauquier},
  title   = {Two distributed problems involving {Byzantine}
             processes},
  journal = {Theoretical Computer Science},
  volume  = {95},
  number  = {1},
  pages   = {169--185},
  day     = {23},
  month   = mar,
  year    = {1992},
  annote  = {The author investigates two problems where processes
             are subject to Byzantine behavior: the naming
             problem (i.e., assigning unique names to processes)
             and the mutual exclusion problem. For these problems
             to be solvable, the author makes the following
             minimal assumptions: if k is a strict upper bound on
             the number of Byzantine processes, then the network
             must be k-connected, meaning that there are at least
             (k+1) disjoint paths between any two nodes. Also,
             communication must be synchronous and the algorithm
             must be non-uniform, i.e., there exists an
             exceptional node (the initiator) which is
             non-Byzantine. Access to public key cryptosystems is
             assumed. For mutual exclusion, the Byzantine
             processes may not hold the token arbitrarily long,
             i.e., their behavior is correct once they are in
             their critical section and during the exit sequence
             from it. This leads to the definition of locally
             Byzantine processes. Difficulties arise, because
             Byzantine nodes may not generally be detected. They
             may act normally when communicating to the outside
             world forever. Naming is achieved through a kind of
             echo algorithm that achieves safety through
             backwards confirmation over a given path. Through
             the k connectivity and communication synchrony it is
             assured that valid information eventually reaches a
             correct node and that this node can check the
             information. The mutual exclusion algorithm bases on
             Byzantine agreement. }
}
@inproceedings{Chandra:1992:WFD,
  author    = {Tushar Deepak Chandra and Vassos Hadzilacos and Sam
               Toueg},
  title     = {The Weakest Failure Detector for Solving Consensus},
  editor    = {Maurice Herlihy},
  booktitle = pro-podc92,
  pages     = {147--158},
  publisher = {ACM Press},
  address   = {Vancouver, BC, Canada},
  month     = aug,
  year      = {1992},
  annote    = {Continuing work \cite{Chandra:1991:UFD} shows that a
               certain form of failure detector is the weakest one
               necessary to solve consensus. See also journal
               version of this paper \cite{Chandra:1996:WFD} and
               other papers on this subject
               \cite{Chandra:1996:UFD,Chandra:1991:UFD}.}
}
@article{Hariri:1992:ASD,
  author  = {Salim Hariri and Alok Choudhary and Behcet Sarikaya},
  title   = {Architectural Support for Designing Fault-Tolerant
             Open Distributed Systems},
  journal = j-IEEE-COMPUTER,
  volume  = {25},
  number  = {6},
  pages   = {50--62},
  month   = jun,
  year    = {1992},
  annote  = {[to read]}
}
@TechReport{Heimerdinger:1992:CFS,
  author      = {Walter L. Heimerdinger and Chuck B. Weinstock},
  title       = {A conceptual framework for system fault tolerance},
  institution = {Software Engineering Institute},
  year        = {1992},
  number      = {CMU/SEI-92-TR-33},
  address     = {Carnegie Mellon University, Pittsburgh, PA},
  month       = oct,
  annote      = {A good introductory text to the traditional concepts and
                 issues in fault-tolerant computing (in the lines of
                 \cite{Laprie:1992:DBC}) targeted at engineers and
                 practitioners. Defines a system, dependability specifications
                 (repeats the $10^{-9}$ reliability rate of commercial aircraft,
                 states the problems with implicit and explicit specifications
                 commented on by David Powell in Madeira), failure modes, faults
                 vs. failures (fault is the failure of a subcomponent, avoids the
                 term error), defines failure regions as opposed to fault regions
                 (vertical vs. horizontal perspective). Enumerates fault tolerance
                 mechanisms (mainly redundancy management) and gives informal
                 definitions of time and space redundancy, which are said to be
                 necessary, not sufficient. The conclusions contain a set of 5 rules
                 for the practitioner how to start off building reliable
                 systems. Everything is underlined with running examples from bridge
                 building and computer systems.}
}
@article{Huang:1992:SSA,
  author             = {Shing Tsaan Huang and Nian Shing Chen},
  title              = {A self-stabilizing algorithm for constructing
                        breadth-first trees},
  journal            = j-IPK,
  volume             = {41},
  number             = {2},
  pages              = {109--117},
  day                = {14},
  month              = feb,
  year               = {1992},
  coden              = {IFPLAT},
  ISSN               = {0020-0190},
  mrclass            = {68M15},
  mrnumber           = {93a:68017},
  bibdate            = {Wed Nov 11 12:16:26 MST 1998},
  acknowledgement    = ack-nhfb,
  affiliation        = {Natl Tsing Hua Univ},
  affiliationaddress = {HsinChu, Taiwan},
  classification     = {723; 921; C1160 (Combinatorial mathematics); C4240
                        (Programming and algorithm theory)},
  corpsource         = {Inst. of Comput. Sci., Nat. Tsing Hua Univ., HsinChu,
                        Taiwan},
  journalabr         = {Inf Process Lett},
  keywords           = {algorithm theory; bounded function; Breadth First
                        Trees; breadth-first trees; computation step; Computer
                        Programming --- Algorithms; Fault Tolerant Software;
                        Mathematical Techniques; rules; self-stabilizing
                        algorithm; Self-Stabilizing Algorithms; Trees; trees
                        (mathematics)},
  treatment          = {T Theoretical or Mathematical},
  annote             = {[to get] Difference to \cite{Chen:1991:SAC}?}
}
@book{Isermann:1992:IDS,
  author    = "Rolf Isermann",
  title     = "{Identifikation dynamischer Systeme}",
  publisher = pub-SV,
  address   = "Berlin",
  year      = "1992",
  annote    = "[Angabe von Armin]"
}
@book{Laprie:1992:DBC,
  editor     = "Jean-Claude Laprie",
  title      = "Dependability: {Basic} concepts and Terminology",
  series     = "Dependable Computing and Fault-Tolerant Systems",
  volume     = "5",
  publisher  = pub-SV,
  OPTaddress = pub-SV:adr,
  year       = "1992",
  annote     = "A joint attempt to unify dependability terminology and
                present it in 5 languages! Great! Maybe based on
                \cite{Laprie:1985:DCF}."
}
@article{Liu:1992:TPF,
  author  = {Zhiming Liu and Mathai Joseph},
  title   = {Transformation of Programs for Fault-tolerance},
  journal = {Formal Aspects of Computing},
  volume  = {4},
  number  = {5},
  year    = {1992},
  pages   = {442--469},
  annote  = {``The task is then to develop programs which perform
    predictably in the presence of \emph{detected} system failures, and
    this requires the representation of such failures in the execution
    of a program.'' (p.443) This must be done at different levels of
    abstraction so it is good to use the same formalism for
    specifications and programs. The formalism used is close to TLA
    \cite{Lamport:1994:TLA} with its state based specification method
    \cite{Lamport:1989:SAS} and notion of refinement
    \cite{Abadi:1991:ERM}. Physical faults are modeled as actions that
    transform a good state into an error state which may lead to a
    violation of the specification. If such an action occurs, a boolean
    variable $f$ is set to true (modeling fault detection by underlying
    hardware). Faults cannot destruct program operations at the lowest
    level of abstraction. The fault affected version of a program $P$ is
    obtained by a transformation $F$ which is assumed to mimic fail-stop
    semantics \cite{Schlichting:1983:FSP}, i.e. once a fault action is
    executed, no further regular program actions occur. To make the
    fault affected version satisfy the original specification $S$, a
    fault-tolerant transformation $T$ must be applied such that
    $F(T(P))$ satisfies $S$. Usually, the transformed version will
    satisfy a weaker specification. $T$ is modeled as adding recovery
    actions which are only enabled when $f$ is true. It is assumed that
    recovery actions are not affected by faults. Overall $F(T(P))$ can
    then be shown to be equivalent to the parallel execution of program,
    fault and recovery actions. Fairness guarantees eventual
    recovery. At a step during the refinement process where there is
    sufficient information about the fault environment (such as the
    number of faulty processes/channels), then the recovery
    transformation can be devised. A specification language for action
    systems similar to \cite{Chandy:1988:PPD} and a notion of
    satisfiability between program and specification is
    devised. (Section 4:) The failure semantics of a program $P$
    regarding a fault set $F$ are the set of executions of $P$ augmented
    by possible executions of actions from $F$. As $F$ is fail-stop,
    this results in a sequence of good states followed by an empty or
    infinite sequence of bad states. A fault transformation is defined
    which changes every command construct to result in the failure
    semantics of the program. With this formal definition of the
    transformation it is actually proved that $F(P)$ is parallel
    execution of $P$ and fault program. Fault transformations are
    transferred to sets of processes. Section 5 defines consistency and
    recovery transformations, the latter in analogy to fault
    transformations. Section 6 defines fault refinement, proves some
    properties of it and recovery transformations, and also proves some
    useful rules when refining programs to make them fault tolerant. A
    protocol for reliable communication is used as an example for the
    method. Interestingly, a variable $b$ is used in the fault program
    which guarantees the finiteness of consecutive faults. Overall,
    safety and progress properties can now be proved. The discussion
    (Sect. 8) states that the highest level of fault environment is the
    transition of $f$ from false to true. The next level action system
    is then an action which assigns $f$ the value true. Subsequent
    refinement steps must introduce more information about the system
    and its faults as the levels on which the faults occur are
    reached. It is an open question whether for any program and any
    fault model the value of $f$ can be derived at some point during the
    refinement process?! Handling real-time is an open question and is
    handled in later papers \cite{Liu:1996:VFR}. The idea of modeling
    faults as actions is attributed to
    \cite{Cristian:1985:RAF}. Overall, the paper is concise and
    rigorously formal, so at first reading many of the ideas are not
    readily visible. I had to read it twice, and after second reading
    like this text very much.}
}
@misc{Mills:1992:NTP,
  author       = "David L. Mills",
  title        = "Network Time Protocol (Version 3)",
  howpublished = "Internet Request for Comments RFC 1305",
  month        = mar,
  year         = "1992",
  OPTannote    = "to get"
}
@phdthesis{Nordahl:1992:SDD,
  author    = "Jens Nordahl",
  title     = "Specification and Design of Dependable Communicating
               Systems",
  school    = "Department of Computer Science, Technical University of
               Denmark",
  year      = "1992",
  OPTannote = "[to get] coins the terms local and global fault assumptions
               discussed in \cite{Liu:1995:FFF}."
}
@Article{Ostroff:1992:FMS,
  author  = "Jonathan S. Ostroff",
  title   = "Survey of Formal Methods for the Specification and
             Design of Real-Time Systems",
  journal = "Journal of Systems and Software",
  volume  = "18",
  number  = "2",
  pages   = "33--60",
  month   = apr,
  year    = "1992",
  annote  = "An extensive survey of real-time programming languages,
             visual modelling languages, and most notably logics and algebras for
             specifying and verifying real-time systems. Real-time programming
             languages mostly only have delay and timeout mechanisms but lack
             formal semantics. Petri Nets are graphical modeling languages. An
             overview explains the different time semantics (linear, branching)
             and real-time temporal logics. Contains 144 references."
}
@inproceedings{Powell:1992:FMA,
  author    = {David Powell},
  title     = {Failure Mode Assumptions and Assumption Coverage},
  editor    = {Dhiraj K. Pradhan},
  booktitle = {Proceedings of the 22nd Annual International Symposium
               on Fault-Tolerant Computing ({FTCS} '92)},
  pages     = {386--395},
  publisher = {IEEE Computer Society Press},
  address   = {Boston, MA},
  month     = jul,
  year      = {1992},
  ISBN      = {0-8186-2875-8},
  annote    = {[to read]}
}
@InProceedings{Rushby:1992:FSV,
  author    = "John Rushby",
  editor    = "J. Vytopil",
  title     = "Formal Specification and Verification of a
               Fault-Masking and Transient-Recovery Model for Digital
               Flight-Control Systems",
  booktitle = "Formal Techniques in Real-Time and Fault-Tolerant
               Systems 2nd International Symposium",
  series    = ser-LNCS,
  volume    = "571",
  pages     = "237--258",
  publisher = pub-SV,
  address   = "Nijmegen, The Netherlands",
  year      = "1992",
  annote    = "[to read] appears also under the same name in a book
               of the same name published by the same editor in 1993
               (Kluwer)."
}
@book{Siewiorek:1992:RCS,
  author    = {Daniel Siewiorek and Robert Swarz},
  title     = {Reliable Computer Systems: Design and Evaluation},
  publisher = {Digital Press},
  year      = {1992},
  OPTannote = {[get it?]}
}
@InProceedings{Shankar:1992:MVG,
  author    = "Natarajan Shankar",
  title     = "Mechanical Verification of a Generalized Protocol for
               {Byzantine} Fault-Tolerant Clock Synchronization",
  editor    = "J. Vytopil",
  booktitle = "Formal Techniques in Real-Time and Fault-Tolerant
               Systems",
  series    = ser-LNCS,
  volume    = "571",
  pages     = "217--236",
  publisher = pub-SV,
  address   = "Nijmegen, The Netherlands",
  month     = jan,
  year      = "1992",
  annote    = "[to read]"
}
@Article{Turek:1992:MFC,
  author  = {John Turek and Dennis Shasha},
  title   = {The many faces of consensus in distributed systems},
  journal = j-IEEE-COMPUTER,
  year    = {1992},
  volume  = {25},
  number  = {6},
  month   = jun,
  pages   = {8--17},
  annote  = "A rewarding (because well written) paper on the
             different shades of (im)possibility of consensus in
             distributed systems. Starts with the parable of La
             Tryste, notes general settings in which consensus is
             (im)possible in message passing systems (synchrony
             of processors, message order, communication delay,
             transmission method), relates results to shared
             memory settings, sketches Fischer, Lynch and
             Paterson's result \cite{Fischer:1985:IDC}, proves
             impossibility of Byzantine agreement in message
             passing settings without signatures. Concludes:
             Global knowledge is much stronger than local
             knowledge."
}
@article{Zhao:1992:SAB,
  author  = {Yi Zhao and Farokh B. Bastani},
  title   = {A Self-Adjusting Algorithm for {Byzantine} Agreement},
  journal = j-DC,
  volume  = {5},
  number  = {4},
  pages   = {219--226},
  year    = {1992},
  annote  = {}
}
@Article{Abadi:1993:CS,
  author  = {Mart{\'\i}n Abadi and Leslie Lamport},
  title   = {Composing Specifications},
  journal = j-TOPLAS,
  year    = {1993},
  volume  = {15},
  number  = {1},
  pages   = {73--132},
  month   = jan,
  annote  = {A ground- and breathtaking paper on the difficulties
    arising when composing specifications of subsystems to get a
    specification of the composed system. It is a formal investigation
    of the exact formulation of the composition principle for
    concurrent systems. A system is here something that interacts with
    an environment over a well-defined boundary. A specification of a
    system here is a set of behaviors at the boundary where the
    environment and the system alternately take steps. Steps of the
    system can contain stuttering steps (i.e., steps where the state
    of the interface does not change) and the environment makes the
    first move. A specification can be expressed by $E\Rightarrow M$
    where $E$ is an assumption about the environment and $M$ is the
    property guaranteed by the system. This is the understanding of
    the transition-axiom approach \cite{Lamport:1989:SAS}. The
    composition principle states that the composition $S$ of systems
    $S_1,\ldots,S_n$ satisfy a specification $E\Rightarrow M$ if three
    conditions hold: (1) $S$ guarantees $M$ if every $S_i$ guarantees
    its own $M_i$. (2) If $E$ holds and every $S_i$ guarantees $M_i$
    then $E_i$ holds for every $S_i$. (3) Every $S_i$ guarantees $M_i$
    if $E_i$ holds. There is an obvious circularity here because
    every component is part of the environment of the other. The main
    result states that the composition principle is valid if the
    environment assumptions are safety properties. The paper contains
    a lot of insightful discussions about related aspects of
    specifications and programming: state vs. action based formalisms
    are compared in section 1.1, the distinction between system and
    environment is treated in sections 1.2 and 1.3. Section 3 contains
    an elaborated discussion on realizability of specifications and
    Section 4 details on the form of a specification. It examines what
    makes up a complete or a partial program and what difficulties
    arise in composition. For example, progress properties are
    inherent part of programs, but are often stated implicitly as an
    incrementation of the program counter or fairness
    assumptions. Formally, progress properties are defined using the
    term `machine realizable', meaning something like `it doesn't add
    additional safety properties'. A specification then is a formula
    $I\cap E_S\cap E_L\Rightarrow M_S\cap M_L$, where
    $E_S$ and $M_S$ are the safety properties of the environment and
    the system respectively, and $E_L$ and $M_L$ are their liveness
    properties; $I$ is an initial state predicate on the environment
    state. Theorem 1 shows that $E_L$ can be incorporated into the
    system's liveness property, resulting in a specification being
    $I\cap E_S\Rightarrow M_S\cap (E_L\Rightarrow
    M_L)$. This means that the prerequisites of the composition
    principle are achievable. Section 4.4 states also that everything
    can be moved to the right hand side of the implication. Then the
    specification does not only specify wanted behaviour but also
    allows arbitrary behavior if $E_S$ is not met. The authors argue
    that this is impractical.}
}
@ARTICLE{Afek:1993:SSU,
  AUTHOR  = "Yehuda Afek and Geoffrey M Brown",
  TITLE   = "Self-stabilization over unreliable communication
             media",
  JOURNAL = j-DC,
  VOLUME  = 7,
  YEAR    = 1993,
  PAGES   = "27--34"
}
@article{Arora:1993:CCF,
  author  = {Anish Arora and Mohamed Gouda},
  title   = {Closure and convergence: {A} foundation
             of fault-tolerant computing},
  journal = j-IEEE-TRANS-SOFTW-ENG,
  volume  = {19},
  number  = {11},
  pages   = {1015--1027},
  year    = {1993}
}
@article{Arora:1993:CIS,
  author  = {Anish Arora and Paul Attie and Michael Evangelist
             and Mohamed Gouda},
  title   = {Convergence of iteration systems},
  journal = j-DC,
  volume  = {7},
  number  = {1},
  pages   = {43--53},
  year    = {1993}
}
@INPROCEEDINGS{Anagnostou:1993:TTP,
  AUTHOR    = "Efthymios Anagnostou and Vassos Hadzilacos",
  TITLE     = "Tolerating transient and permanent Failures",
  BOOKTITLE = pro-wdag93,
  YEAR      = 1993,
  PAGES     = "174--188",
  annote    = "The authors investigate the classes of problems
               which are solvable in the presence of transient and
               permanent failures. They begin by stating that
               self-stabilization has been the domain of research
               on tolerating transient failures which manifest
               themselves as arbitrary memory corruptions. On the
               other hand, fault tolerance has focussed on
               permanent failures such as process crashes. While
               transient failures could affect all processes,
               permanent failures were restricted to a certain
               subset of processes (usually half or one third of
               all processes). The authors show that tolerating
               transient and permanent failures is impossible in
               asynchronous systems for all problems which are
               ``failure sensitive''. Failure sensitive problems
               are such that it is vitally important to know whether a
               process has crashed or not. Examples for failure
               sensitive problems are leader election, consensus
               and spanning tree construction. As an example for a
               solvable problem they give an algorithm for unique
               naming in ring networks. These results give insight
               into the fundamental distinction between transient
               and permanent failures: transient failures are
               detectable in asynchronous systems, permanent ones
               are not. But the impossibility results are not too
               devastating since election and consensus are
               unsolvable in asynchronous systems anyway
               \cite{Fischer:1985:IDC}."
}
@incollection{Babaoglu:1993:CGS,
  author    = {{\"O}zalp Babao\u{g}lu and Keith Marzullo},
  title     = {Consistent global states of distributed systems:
               {Fundamental} concepts and mechanisms},
  editor    = {Sape Mullender},
  booktitle = {Distributed Systems},
  crossref  = {Mullender:1993:DS},
  chapter   = {4},
  pages     = {55--96},
  edition   = {Second},
  publisher = pub-AW,
  year      = {1993},
  annote    = {A well written survey on the theory of consistent
               global states. It is well suited as an introductory
               text for lectures on causality, distributed
               computations, snapshots, observations and predicate
               detection. A more research oriented text is
               \cite{Schwarz:1994:DCR}.}
}
@Article{Barborak:1993:CPF,
  author  = "Michael Barborak and Anton Dahbura and Miroslaw Malek",
  title   = "The consensus problem in fault-tolerant computing",
  journal = j-ACM-COMP-SURVEYS,
  year    = "1993",
  volume  = "25",
  number  = "2",
  pages   = "171--220",
  month   = jun,
  annote  = "This paper surveys research on the consensus
             problem, comparing and unifying the two traditional
             approaches, which are (1) system diagnosis and (2)
             the Byzantine Generals Problem (BGP). Approach (1)
             tries to reach a consistent state by letting nodes
             diagnose each other and infer from the resulting set
             of results (together with additional assumptions)
             which nodes are faulty. These nodes can then be
             avoided or shut down. Approach (2) applies
             distributed algorithms that reach nontrivial
             consensus on a single value in spite of possibly
             malicious faulty nodes within the network. Faults are
             thus masked. The paper is a near-to-complete survey
             of research up to about 1993, rather technical but
             exact."
}
@InProceedings{Berrou:1993:NSL,
  author    = {C. Berrou and A. Glavieux and P. Thitimajshima},
  title     = {Near {Shannon} Limit Error-Correcting Coding and
               Decoding: Turbo Codes},
  booktitle = {IEEE Int. Conf. on Communications (ICC-1993)},
  pages     = {1064--1070},
  year      = 1993,
  annote    = {the basic reference to the term `turbo codes'.}
}
@Article{Birman:1993:PGA,
  author      = {K. P. Birman},
  title       = {The Process Group Approach to Reliable Distributed
                 Computing},
  journal     = j-CACM,
  year        = {1993},
  volume      = {36},
  number      = {12},
  pages       = {36--53},
  OPTkeywords = {ISIS, process groups, replicated processes},
}
@Article{Bowen:1993:SCS,
  author  = {Jonathan Bowen and Victoria Stavridou},
  title   = {Safety-critical systems, formal methods and standards},
  journal = {IEE/BCS Software Engineering Journal},
  volume  = {8},
  number  = {4},
  pages   = {189--209},
  month   = jul,
  year    = {1993},
  annote  = {A well-written survey of the use of formal methods
             in industry for the design and implementation of
             safety critical systems as of 1992 (should be read
             in conjunction with \cite{Rushby:1994:CSP}). A source
             for lots of citations on the importance of dependability
             and ways to achieve it. I especially like the introduction
             ``Human lives have depended on mathematical calculations
             for centuries\ldots'' where Babbage is shown to be one
             of the first researchers in computer dependability.
             Gives examples in the fields of aviation, railway systems,
             nuclear power plants, medical systems, ammunition
             control and embedded microprocessors. Standards are
             rather UK centric. Great bibliography.}
}
@InProceedings{Diehl:1993:RAD,
  author    = {Claire Diehl and Claude Jard and Jean-Xavier Rampon},
  title     = {Reachability Analysis on Distributed Executions},
  booktitle = {Proceedings of the 4th International Joint Conference
               on Theory and Practice of Software Development
               {TAPSOFT}'93},
  editor    = {Marie-Claude Gaudel and Jean-Pierre Jouannaud},
  series    = ser-LNCS,
  volume    = {668},
  publisher = pub-SV,
  address   = {Orsay, France},
  pages     = {629--643},
  month     = apr # "~13--17,",
  year      = {1993},
  annote    = {[to read]}
}
@Article{Dolev:1993:SDS,
  author  = {Shlomi Dolev and Amos Israeli and Shlomo Moran},
  title   = {Self-stabilization of dynamic systems
             assuming only read/write atomicity},
  journal = j-DC,
  year    = 1993,
  volume  = 7,
  pages   = {3--16},
  annote  = {describes fair protocol combination, i.e.,
             composition of self-stabilizing protocols.}
}
@InProceedings{Dolev:1993:WCS,
  author    = {Shlomi Dolev and Jennifer L. Welch},
  title     = {Wait-Free Clock Synchronization (Extended Abstract)},
  booktitle = pro-podc93,
  address   = {Ithaca, New York, USA},
  pages     = {97--108},
  month     = aug,
  year      = {1993},
  annote    = {The problem solved is the following: build an
               algorithm that guarantees that for some fixed $k$ a
               processor P which has been working correctly for $k$
               time units (and as long as it continues to work
               correctly) satisfies: (1) P's clock ticks normally
               (i.e., it is not adjusted), and (2) P's clock agrees
               with the clocks of all other processes which have
               been working correctly for the last $k$ time
               periods. The algorithm should handle any form of
               transient failures as well as ``napping'' failures,
               i.e., processors stop operation for arbitrarily long
               times and then resume work without noticing that
               they have stopped. A protocol that achieves this
               goal in the presence of napping failures is called
               wait-free. The authors present four such algorithms
               for different system settings (non/assumption of
               global pulse, global/local read/write atomicity
               etc.). Two of these protocols are both wait-free and
               self-stabilizing. Clocks seem to be unbounded.}
}
@Book{Freyermuth:1993:WFB,
  author    = {B. Freyermuth},
  title     = {{Wissensbasierte Fehlerdiagnose am Beispiel eines
               Industrieroboters}},
  series    = {Fortschr.-Ber. VDI Reihe 8},
  number    = {315},
  publisher = {VDI-Verlag},
  address   = {{D\"usseldorf}},
  year      = {1993},
  note      = {Dissertation TH Darmstadt},
  annote    = {[Angabe von Armin]}
}
@InProceedings{Gopal:1993:USF,
  author    = {Ajei S. Gopal and Kenneth J. Perry},
  title     = {Unifying self-stabilization and fault-tolerance},
  booktitle = pro-podc93,
  publisher = {ACM Press},
  pages     = {195--206},
  year      = {1993},
  annote    = {The authors explore the possibility of building
               protocols that tolerate transient (``systemic'') as
               well as permanent (``process'') failures. They
               arrive at similar conclusions as Anagnostou and
               Hadzilacos \cite{Anagnostou:1993:TTP}: there are no
               protocols that can solve general problems in finite
               stabilization time because it is impossible to
               distinguish a crashed process from one that
               continually experiences send omission failures. Even
               more, the process which cannot send messages does
               not know whether it can communicate or not
               because of its inability to determine how it arrived
               at its present state. It is however possible to
               solve problems if ``solvability'' is restricted to
               the communicating (or functioning) subset of
               processes. These results count for synchronous
               (round based) protocols. The idea of problem solving
               in the presence of transient faults is: never
               terminate and regularly purge your computation
               history. In the paper, also asynchronous systems are
               examined: the authors present a self-stabilizing
               eventually strong failure detector based on an
               eventually weak failure detector. This failure
               detector can help solve consensus in transient fault
               environments. It uses unbounded counters and
               resembles very much the Heartbeat failure detector
               \cite{Aguilera:1997:HTF}.}
}
@Article{Gumm:1993:AGA,
  author  = {H. Peter Gumm},
  title   = {Another glance at the {Alpern-Schneider} characterization
             of safety and liveness in concurrent executions},
  journal = j-IPL,
  year    = {1993},
  volume  = {47},
  number  = {6},
  pages   = {291--294},
  url     = {citeseer.nj.nec.com/gumm93another.html},
  annote  = {Revisits the Alpern-Schneider result \cite{Alpern:1985:DL}
             on ``every property is the intersection of a safety and liveness
             property'' in a more abstract setting: The result is restated in the
             context of a meet-preserving map between two complete Boolean
             algebras. The theorem is more general than Alpern-Schneider since
             it allows a new application in a simplified setting of UNITY
             style logics \cite{Chandy:1988:PPD}: safety properties are those
             where a set of transitions is forbidden. This is similar to the
             fusion-closedness assumption on specifications of
             \cite{Arora:1998:CDM}.}
}
@InCollection{Hadzilacos:1993:FTB,
  author    = {Vassos Hadzilacos and Sam Toueg},
  title     = {Fault-tolerant broadcasts and related problems},
  crossref  = {Mullender:1993:DS},
  booktitle = {Distributed Systems},
  editor    = {Sape Mullender},
  publisher = pub-AW,
  edition   = {Second},
  chapter   = {5},
  pages     = {97--145},
  year      = {1993}
}
@Article{Katz:1993:SEM,
  author  = {Shmuel Katz and Kenneth J. Perry},
  title   = {Self-stabilizing extensions for message-passing
             systems},
  journal = j-DC,
  year    = 1993,
  volume  = 7,
  pages   = {17--26},
  annote  = {[to write]}
}
@InCollection{Kopetz:1993:RTD,
  author    = {Hermann Kopetz and Paulo Ver{\'\i}ssimo},
  title     = {Real Time and Dependability Concepts},
  crossref  = {Mullender:1993:DS},
  booktitle = {Distributed Systems},
  editor    = {Sape Mullender},
  publisher = pub-AW,
  edition   = {Second},
  chapter   = {16},
  pages     = {411--446},
  year      = {1993},
  annote    = {gives informal overview over (among others) redundancy}
}
@InProceedings{Kurshan:1993:VM6,
  author    = {R. P. Kurshan and L. Lamport},
  title     = {Verification of a Multiplier: 64 Bits and Beyond},
  booktitle = {Proceedings of the 5th International Conference on
               Computer Aided Verification},
  editor    = {C. Courcoubetis},
  series    = ser-LNCS,
  volume    = {697},
  publisher = pub-SV,
  address   = {Elounda, Greece},
  pages     = {166--179},
  year      = {1993},
}
@InCollection{Liu:1993:SVR,
  author    = {Zhiming Liu and Mathai Joseph},
  title     = {Specification and verification of recovery in
               asynchronous communicating systems},
  booktitle = {Formal Techniques in Real-time and Fault-tolerant Systems},
  editor    = {Jan Vytopil},
  publisher = {Kluwer},
  chapter   = {6},
  pages     = {137--165},
  year      = {1993},
  annote    = {A nice presentation of transformation based
               fault-tolerance verification similar to \cite{Peled:1994:CFF}. The
               paper first nicely presents the formal prerequisites (states and
               behaviors, specifications and programs, refinement, asynchronous
               communication). Faults are modeled as a set of fault actions and a
               fault transformation, fault-tolerant refinement are defined as in
               \cite{Liu:1996:VFR,Liu:1995:FFF} although I like the presentation
               here most. Detection is not covered here; an error variable flags
               the detection of a physical fault. Fault tolerance is achieved by
               another form of transformation exemplified for the class of
               checkpointing and backward-recovery programs. Consistent checkpoints
               and rollback operations to the most recent checkpoint are treated at
               length and some Theorems about the sufficiency of this method are
               proved (reminds me of the optimality proof of
               \cite{Singhal:1995:OPA}). Failure during recovery is discussed: if
               recovery is fault-tolerant or not subject to faults, then recovery
               can be assumed atomic. Failures within recovery can be handled by
               restarting recovery when they are detected. Failures during
               checkpointing are handlable if we assume that there is at least one
               (initial) checkpoint available to which rollback is possible. The
               conclusions state that backward recovery will result in the
               satisfaction of a degraded specification. In open systems the
               repeated communication with the environment must not be
               neglected. The method of fault modeling is attributed to
               \cite{Cristian:1985:RAF}. This paper is seen as a generalization of
               this work and that of \cite{Schlichting:1983:FSP}.}
}
@InProceedings{Li:1993:FTD,
  author    = {Pei-yu Li and Bruce McMillin},
  title     = {Fault-Tolerant Distributed Deadlock Detection/Resolution},
  booktitle = {Proceedings of the 17th Annual International Computer
               Software and Applications Conference (COMPSAC'93)},
  pages     = {224--230},
  month     = nov,
  year      = 1993,
  annote    = {Also University of Missouri, Rolla, Department of Computer
               Science Technical Report Number CSC 92-04. This paper takes the
               fault diagnosis approach to failure detection (look at
               \cite{Barborak:1993:CPF} for an intro to system diagnosis). I'm
               not sure how this relates to the standard interpretation of
               failure detectors \cite{Chandra:1996:UFD}, but here eventually
               every correct process knows the identities of all failed
               processes. A deadlock detection algorithm is proposed using a
               priority based probe approach to find cycles in the wait-for graph
               of an application. It can only detect deadlocks if there is at
               most one process failure in a deadlock cycle (a result is cited
               why being better is not possible). I don't see where this
               restriction comes from. Overall a nice text giving a somewhat different
               view of detecting stable predicates. Does not cite \cite{Shah:1984:DSS}
               although that paper also does deadlock detection.}
}
@InProceedings{Lincoln:1993:FVA,
  author    = {Patrick Lincoln and John Rushby},
  title     = {Formal Verification of an Algorithm for Interactive
               Consistency under a Hybrid Fault Model},
  booktitle = {Computer-Aided Verification, CAV '93},
  editor    = {Costas Courcoubetis},
  series    = ser-LNCS,
  volume    = {697},
  publisher = pub-SV,
  address   = {Elounda, Greece},
  pages     = {292--304},
  month     = jun # "/" # jul,
  year      = {1993},
  annote    = {Good cite for the term `hybrid fault model'.}
}
@Book{Mullender:1993:DS,
  editor    = {Sape Mullender},
  title     = {Distributed Systems},
  edition   = {Second},
  publisher = pub-AW,
  year      = 1993,
  annote    = {An excellent collection of substantial papers not
               only on the theoretical foundations of distributed
               systems (although these chapters are especially
               rewarding).}
}
@Article{Neiger:1993:SSC,
  author  = {Gil Neiger and Sam Toueg},
  title   = {Simulating Synchronized Clocks and Common Knowledge in
             Distributed Systems},
  journal = {Journal of the ACM},
  volume  = {40},
  number  = {2},
  pages   = {334--367},
  month   = apr,
  year    = {1993},
  area    = {Distributed Computing},
  annote  = {[to read]}
}
@InProceedings{Nordahl:1993:DFD,
  author    = {Jens Nordahl},
  title     = {Design for dependability},
  booktitle = {Proceedings of the third IFIP International Working
               Conference on Dependable Computing for Critical
               Applications (DCCA-3)},
  editor    = {Carl E. Landwehr},
  publisher = pub-SV,
  pages     = {29--38},
  year      = {1993},
  annote    = {Nordahl shows how to verify that a system consisting of
               subcomponents can be proved correct in the presence of component
               failures. Three concepts are basic: (1) design, (2) correctness of
               design and (3) failure mode. A system is a process (here, CSP
               \cite{Hoare:1984:CSP} is used as a formalism throughout; we use the
               terminology of Lamport \cite{Lamport:1989:SAS} in this
               annotation). A specification is a property. A process $P$ implements
               a specification $S$ if all executions of $P$ are contained in $S$. A
               distinction between processes and specifications is made but it is
               remarked that this is not a central requirement (so other formalisms
               such as \cite{Pnueli:1981:TSC} can be used). A system can consist of
               a collection of subsystems (or components). A design determines how
               the components interact. (1) A design is a tuple consisting of a
               function mapping $n$ systems to a (new) system and a set of $n$
               subsystem specifications. The function can be some composition
               operator (parallel, sequential etc. and compositions of these
               operators). A design contains enough information to reason about the
               corresponding hierarchical level. (2) A design is correct regarding
               a specification $S$ iff the combined system satisfies $S$ whenever
               the subcomponents satisfy their specification. (3) A failure mode is
               a specification describing the behavior of a system when it is
               faulty. Such a failure mode may be given by a component designer
               when making assumptions at design time or it may be derived by an
               engineer from observing faulty system behavior at runtime. For
               example, a failure mode for Byzantine behaviour is the predicate
               true. Components can have several failure modes, and for $n$
               components this is expressed as an $n$-tuple $(F_1,\ldots,F_n)$ of
               sets of failure modes. For one combination of failure modes
               $(S_1,\ldots,S_n)$ one can prove that a design involving these
               subcomponents is correct regarding some system specification
               $S$. This can be extended to cover all possible combinations of
               component failure modes (e.g. to show that the system satisfies $S$
               in any case). Two notions of fault-tolerance are defined: masking
               fault tolerance (called `fault-tolerance') and fail-softness. A
               system design is fault-tolerant if it is correct regarding an
               $n$-tuple of component failure modes and the original correctness
               specification $S$. Fail-softness is defined as fault tolerance where
               $S$ is replaced by some weaker specification (which one to choose is
               a pragmatic issue, says Nordahl). The proof of correctness of design
               and fault tolerance can now be performed in the same logical
               framework as before. An example (stand by spare system) is given and
               proved. The conclusions discuss the following aspects: (a) the
               faulty behavior of a component is not given as a ``delta'' of its
               original correctness specification and a description of faulty
               behavior, but rather as a ``finished'' specification (i.e. a failure
               mode). Another approach is to calculate the weakened specification
               from the original specification and a failure model (such methods
               are \cite{Liu:1992:TPF,Peleska:1991:DVF,Gaertner:1999:ESD}). The
               disadvantage of the calculational approach is the necessity of
               calculations and the restrictions imposed on specifying faulty
               behavior. (I think both are equivalent.) (b) compositionality is
               achieved by defining fault tolerance of a design as a function of a
               single combination of subcomponent failure modes. Global assumptions
               about what combinations may arise can be dealt with at a higher
               level. (c) calculating the likelihood of failure can be integrated
               into the method quite easily by associating probabilistic measures
               to combinations of failure modes. Overall this is a very concise
               and well-written paper.}
}
@InProceedings{Ricciardi:1993:UPN,
  author    = {Aleta Ricciardi and Andr{\'e} Schiper and Kenneth Birman},
  title     = {Understanding partitions and the ``no partition''
               assumption},
  booktitle = {Proceedings of the 4th Workshop on Future Trends of
               Distributed Computing Systems (FTDCS-4)},
  year      = {1993},
  annote    = {[to read]}
}
@InProceedings{Schepers:1993:CPT,
  author    = {H. Schepers and R. Gerth},
  title     = {A Compositional Proof Theory for Fault Tolerant
               Real-Time Distributed Systems},
  booktitle = {Symposium on Reliable Distributed Systems ({SRDS}
               '93)},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, Ca., USA},
  pages     = {34--43},
  month     = oct,
  year      = {1993},
  ISBN      = {0-8186-4310-2},
  annote    = {[to get] Extends the work of \cite{Schepers:1994:TCP}
               to real time.}
}
@InProceedings{Schepers:1993:TFT,
  author    = {Henk Schepers},
  title     = {Tracing Fault Tolerance},
  booktitle = {Proceedings of the third IFIP International Working
               Conference on Dependable Computing for Critical
               Applications (DCCA-3)},
  editor    = {Carl E. Landwehr},
  publisher = pub-SV,
  pages     = {39--48},
  year      = {1993},
  annote    = {Basis to this paper is a system of sequential processes
               communicating via synchronous unidirected channels much in the sense
               of CSP \cite{Hoare:1984:CSP}. The semantics of a process are the set
               of possible message sequences (called histories or behaviors) at its
               observable channels. The set of all histories is divided into
               normal, exceptional and catastrophic behaviors. Normal and
               exceptional ones are acceptable, and only these are covered by fault
               tolerance mechanisms. Catastrophic behaviors fall outside of the
               fault hypothesis. A fault hypothesis is a reflexive relation on
               histories defining how the fault changes the history (i.e. a
               relation on normal behaviors and exceptional behaviors). A set of
               behaviors with respect to a fault hypothesis is obtained by
               augmenting the original set of traces according to the fault
               hypothesis relation. Reflexivity ensures that only traces are added
               (none removed) from the original set of traces. To prove that a
               system tolerates some fault hypothesis one must show, that the
               composition of the original system running under some fault
               hypothesis and some tolerance mechanism satisfies the original
               correctness specification. The examples given are a communication
               channel which may lose or corrupt messages, and a ``stable
               disk''. Only safety properties are investigated. The conclusions
               contain a good survey of formal methods in fault tolerance up to
               1993: Cristian \cite{Cristian:1985:RAF} is cited as the first to
               separate normal specification from tolerance
               specification. Formalisms in which faults are treated explicitly are
               \cite{Weber:1989:FSF,Joseph:1987:PRF,Peleska:1991:DVF}. The final
               sentence is: ``We currently investigate modeling graceful
               degradation as switching to another, less ambitious, set of
               acceptable histories.'' For this, see \cite{Herlihy:1991:SGD}.
               An extended version appeared as \cite{Schepers:1994:TCP}.}
}
@InProceedings{Schiper:1993:VSC,
  author    = {Andr{\'e} Schiper and Aleta Ricciardi},
  title     = {Virtually-synchronous communication based on a weak
               failure suspector},
  booktitle = pro-ftcs93,
  pages     = {534--543},
  year      = {1993},
  annote    = {A (quite mind-blowing) paper with lots of notation
               and definitions on how to implement a
               group membership service with certain semantics in
               asynchronous environments. The information that a
               process has about the functional states of the other
               processes in the group is called its
               view. Membership services that allow only a single
               view to exist in the system are said to have linear
               semantics. Those which allow concurrent views have
               either weak-partial (views may overlap) or strong
               partial (views may not overlap) semantics. The paper
               shows that strong partial semantics are related to
               virtually synchronous communication (VSC), however, an
               intuitive definition of VSC is not readily
               given. The authors propose a three-component
               architecture for implementing VSC in asynchronous
               systems: a weak failure suspector forms the basis
               for a view and a multicast component, which interact
               on a higher level. The failure suspector has weak
               completeness and the accuracy is ensured by
               either forcefully crashing the suspected process or
               by ensuring that the suspected process equally
               suspects the suspecting process. Crashed processes
               can recover but are thereafter new processes with
               new process identities. The failure suspector used
               here does not seem to fit into the scheme of Chandra
               and Toueg \cite{Chandra:1996:UFD}.}
}
@Article{Schneider:1993:SS,
  author  = {Marco Schneider},
  title   = {Self-stabilization},
  journal = j-ACM-COMP-SURVEYS,
  year    = 1993,
  volume  = 25,
  number  = 1,
  pages   = {45--67},
  annote  = {Standard reference survey on self-stabilization,
             nearly always cited together with Dijkstra
             \cite{Dijkstra:1974:SSS}.}
}
@InCollection{Schneider:1993:WGM,
  author      = {Fred B. Schneider},
  title       = {What good are models and what models are good?},
  OPTcrossref = {Mullender:1993:DS},
  booktitle   = {Distributed Systems},
  editor      = {Sape Mullender},
  publisher   = pub-AW,
  edition     = {Second},
  chapter     = {2},
  pages       = {17--26},
  year        = 1993
}
@PhdThesis{Varghese:1993:SLC,
  author    = {George Varghese},
  title     = {Self-stabilization by local checking and correction},
  school    = {MIT},
  year      = {1993},
  note      = {Published as Technical Report MIT/LCS/TR-583},
  OPTannote = {to write}
}
%%% crossref is disabled (OPT prefix) because classic BibTeX requires a
%%% cross-referenced parent to come after the referring entry, while the
%%% year/author sort places Mullender:1993:DS before this one.  All
%%% inherited fields are spelled out below, so nothing is lost.
@InCollection{Verissimo:1993:RTC,
  author      = {Paulo Ver{\'\i}ssimo},
  title       = {Real-time communication},
  OPTcrossref = {Mullender:1993:DS},
  booktitle   = {Distributed Systems},
  editor      = {Sape Mullender},
  publisher   = pub-AW,
  edition     = {Second},
  chapter     = {17},
  pages       = {447--490},
  year        = {1993},
  annote      = {u.a. defines steadyness und tightness}
}
@Article{Abadi:1994:OFR,
  author   = {Mart{\'\i}n Abadi and Leslie Lamport},
  title    = {An Old-Fashioned Recipe for Real Time},
  journal  = j-TOPLAS,
  volume   = {16},
  number   = {5},
  pages    = {1543--1571},
  month    = sep,
  year     = {1994},
  url      = {http://www.acm.org/pubs/toc/Abstracts/0164-0925/186058.html},
  abstract = {Traditional methods for specifying and reasoning about
              concurrent systems work for real-time systems. Using
              TLA (the temporal logic of actions), we illustrate how
              they work with the examples of a queue and of a
              mutual-exclusion protocol. In general, two problems
              must be addressed: avoiding the real-time programming
              version of Zeno's paradox, and coping with
              circularities when composing real-time
              assumption/guarantee specifications. Their solutions
              rest on properties of machine closure and
              realizability.},
  keywords = {theory; verification},
  subject  = {{\bf F.3.1}: Theory of Computation, LOGICS AND
              MEANINGS OF PROGRAMS, Specifying and Verifying and
              Reasoning about Programs, Specification techniques.
              {\bf D.2.4}: Software, SOFTWARE ENGINEERING, Program
              Verification, Correctness proofs.},
  annote   = {[to get]}
}
@Article{Afek:1994:RCU,
  author  = {Yehuda Afek and Hagit Attiya and Alan Fekete and
             Michael Fischer and Nancy Lynch and Yishay Mansour and
             Dai-Wei Wang and Lenore Zuck},
  title   = {Reliable Communication Over Unreliable Channels},
  journal = {Journal of the ACM},
  volume  = {41},
  number  = {6},
  pages   = {1267--1297},
  month   = nov,
  year    = {1994},
  annote  = {[to read]}
}
@InProceedings{Alur:1994:FF,
  author       = {Rajeev Alur and Thomas Henzinger},
  title        = {Finitary Fairness},
  booktitle    = {Proceedings, Ninth Annual {IEEE} Symposium on Logic in
                  Computer Science},
  organization = {IEEE Computer Society Press},
  address      = {Paris, France},
  pages        = {52--61},
  month        = "4--7 " # jul,
  year         = {1994},
  references   = {{STOC::AlurAT1994} {JACM::BrachaT1985}
                  {JACM::DworkLS1988} {JACM::FischerLP1985}
                  {JACM::PeaseSL1980}},
  annote       = {Introduces the term finitary fairness: requires that for
                  every run of the system there is an unknown bound $k$ such that no
                  enabled transition is postponed more than $k$ consecutive
                  times. Cited and discussed in \cite{Merritt:1998:FSO}.}
}
@InProceedings{Arora:1994:CSB,
  author    = {Anish Arora and Mohamed G. Gouda and George Varghese},
  title     = {Constraint satisfaction as a basis for
               designing nonmasking fault-tolerance},
  booktitle = pro-icdcs94,
  pages     = {424--431},
  year      = 1994,
  annote    = {Important paper on self-stabilization
               methodologies. Has many relations to Varghese's
               thesis \cite{Varghese:1993:SLC}. Published as a more
               citeable Journal version \cite{Arora:1996:CSB}.}
}
@Article{Arora:1994:DR,
  author  = {Anish Arora and Mohamed G. Gouda},
  title   = {Distributed reset},
  journal = j-IEEE-TRANS-COMP,
  year    = 1994,
  month   = sep,
  volume  = 43,
  number  = 9,
  pages   = {1026--1038},
  annote  = {}
}
@InProceedings{Arora:1994:ERT,
  author       = {Anish Arora},
  title        = {Efficient Reconfiguration of Trees: {A} Case
                  Study in Methodical Design of Nonmasking
                  Fault-Tolerant Programs},
  booktitle    = {Proceedings of the 3rd International Symposium on
                  Formal Techniques in Real-Time and Fault-Tolerant
                  Systems (FTRTFTS'94)},
  editor       = {H. Langmaack and W.-P. de Roever and J. Vytopil},
  organization = {Organized Jointly with
                  the Working Group Provably Correct Systems-ProCoS},
  series       = ser-LNCS,
  volume       = {863},
  publisher    = pub-SV,
  address      = {L{\"u}beck, Germany},
  pages        = {110--127},
  month        = sep,
  year         = {1994},
  annote       = {An application of the method of constraint satisfaction
                  \cite{Arora:1994:CSB} to the problem of maintaining a rooted
                  spanning tree in a network of nodes that may failstop, recover and
                  where links may go down temporarily. Contains a brief discussion on
                  the benefits of nonmasking fault tolerance. Shows that the concept
                  of stabilization can handle ``permanent'' faults as well.}
}
@InProceedings{Awerbuch:1994:SLC,
  author    = {Baruch Awerbuch and Boaz Patt-Shamir and George Varghese
               and Shlomi Dolev},
  title     = {Self-stabilization by local checking and
               global reset},
  booktitle = pro-wdag94,
  pages     = {326--339},
  year      = 1994,
  annote    = {to write}
}
@Article{Chandrasekar:1994:ASA,
  author  = {Srinivasan Chandrasekar and Pradip K. Srimani},
  title   = {A self-stabilizing algorithm to synchronize digital
             clocks in a distributed system},
  journal = {Computers and Electrical Engineering},
  volume  = {20},
  number  = {6},
  pages   = {439--444},
  year    = {1994},
  annote  = {Focusses on maintaining ``hardware'' clocks in step.
             Takes the self-stabilization view (as done by
             \cite{Gouda:1990:SU,Arora:1991:MDS}). This means that nodes access
             neighboring states by reading variables. Thus it abstracts from
             message passing and physical clock drift.}
}
@InProceedings{Cristian:1994:AFT,
  author    = {Flaviu Cristian},
  title     = {Abstractions for Fault-Tolerance},
  booktitle = {Proceedings of the {IFIP} 13th World Computer
               Congress. Volume 3 : Linkage and Developing Countries},
  editor    = {Karen Duncan and Karl Krueger},
  publisher = {Elsevier Science Publishers},
  address   = {Amsterdam, The Netherlands},
  pages     = {278--286},
  month     = aug,
  year      = {1994},
  ISBN      = {0-444-81988-6},
  annote    = {The author presents some fundamental concepts of
               fault tolerance and uses them to discuss several
               current paradigms of fault tolerant computing. Basic
               concepts include notions of service, server, the
               depends-upon relation, failure classification,
               failure semantics, failure masking by hierarchical
               masking or by group masking. The fault tolerant
               services discussed are: duplicated processors with
               matching to provide crash failure semantics, error
               detection/correction codes in stable storage to
               provide read omission failure semantics, restartable
               servers, point-to-point communication services,
               distributed storage services, restartable services,
               replicated storage and servers. Overall a paper
               along the masking fault tolerance perspective as in
               \cite{Cristian:1991:UFD}. When redundancy is not
               available anymore, ``users must have some manegable
               form of system behaviour that they can handle
               without too much pain.'' Interesting are the two
               laws of fault tolerance: First law: ``The stronger a
               specified failure semantics, the more expensive and
               complex it is to build a server that implements
               it.'' Second law: ``The weaker the failure semantics
               of members and communication, the more complex and
               expensive the group management mechanisms become.''
               Are these laws useful?}
}
@Article{Cristian:1994:CHW,
  author  = {Flaviu Cristian and Richard de Beijer and Shivakant Mishra},
  title   = {Comparing how well asynchronous atomic broadcast protocols
             perform},
  journal = {Distributed Systems Engineering Journal},
  volume  = {1},
  number  = {4},
  pages   = {177--201},
  year    = {1994},
  annote  = {[to read] Title of the TR: A performance comparison of
             asynchronous atomic broadcast protocols.}
}
@InCollection{Cristian:1994:CSP,
  author    = {Flaviu Cristian and Houtan Aghili and Ray Strong},
  title     = {Clock Synchronization in the Presence of Omission and
               Performance Failures, and Processor Joins},
  booktitle = {Global States and Time in Distributed Systems},
  editor    = {Zhonghua Yang and T. Anthony Marsland},
  publisher = {IEEE Computer Society Press},
  year      = {1994},
  annote    = {A revised version of \cite{Cristian:1986:CSP}. Gives
               a simplified version of the protocol of \cite{Dolev:1995:DFC},
               possible by reducing the types of failures assumed to occur. Here,
               only omission and performance failures are taken into account
               that do not partition the network. The algorithm is based on the
               paradigm of message diffusion. It assumes a maximum message
               delivery delay and a bounded drift rate of hardware clocks.
               It is mentioned that the MTTF of modern quartz clocks exceeds
               15 to 25 years, military versions even of hundreds of years.
               Overall, a paper showing that a weaker failure model results in
               simpler protocols.}
}
@InCollection{Flatebo:1994:SSD,
  author    = {Mitchell Flatebo and Ajoy Kumar Datta and Sukumar Ghosh},
  title     = {Self-stabilization in distributed systems},
  booktitle = {Readings in Distributed Computing
               Systems},
  editor    = {T. L. Casavant and M. Singhal},
  publisher = {IEEE Computer Society Press},
  chapter   = 2,
  pages     = {100--114},
  year      = 1994
}
@Article{Garg:1994:DWU,
  author  = {V. K. Garg and Brian Waldecker},
  title   = {Detection of weak unstable predicates in distributed
             programs},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume  = {5},
  number  = {3},
  pages   = {299--307},
  year    = {1994},
  annote  = {Angaben aus \cite{Stoller:1997:DGP}.}
}
%%% Gouda: stabilizing observers (Information Processing Letters).
@Article{Gouda:1994:SO,
  author    = {Mohamed G. Gouda},
  title     = {Stabilizing observers},
  journal   = {Information Processing Letters},
  year      = {1994},
  month     = oct,
  day       = {28},
  volume    = {52},
  number    = {2},
  pages     = {99--103},
  treatment = {P Practical; T Theoretical or Mathematical},
  keywords  = {array of temperatures; boolean value; Convergence of
               numerical methods; Distributed computer systems;
               distributed processing; Error analysis; Fault tolerant
               computer systems; Observability; performance
               evaluation; Programmed control systems; sensors;
               stability; stabilizing observers; Stabilizing
               observers; Stabilizing phase synchronization; System
               stability; Uni-directional token systems}
}
%%% Hadzilacos and Toueg: Cornell technical report on fault-tolerant
%%% broadcasts and their relation to consensus.
@TechReport{Hadzilacos:1994:MAF,
  author      = {Vassos Hadzilacos and Sam Toueg},
  title       = {A Modular Approach to Fault-Tolerant Broadcasts and
                 Related Problems},
  institution = {Cornell University, Computer Science Department},
  number      = {TR94-1425},
  month       = may,
  year        = {1994},
  pages       = {83},
  annote      = {Looks like an extended paper version of the chapter in
                 Mullender's book on distributed systems
                 \cite{Mullender:1993:DS}. The contents: While
                 theoretical research in fault tolerant distributed
                 computing has focussed mainly on solving the
                 consensus problem, applied research has investigated
                 reliable broadcasts. The authors show that both
                 problems are closely related. They give several
                 precise semantics of fault models (Sect. 2.3, e.g.,
                 they model crash failure by introducing an
                 additional non-leavable crash state and
                 corresponding state transitions) and a good
                 definition of synchrony, asynchrony and partial
                 synchrony of models (Sect. 2.4). Timing failures are
                 also discussed (Sect. 2.5). They develop a suite of
                 broadcast specifications and algorithms separately
                 and in an incremental way which is very
                 instructive. Types of broadcasts are: reliable
                 broadcast, timed reliable broadcast, uniform
                 reliable broadcast (which places restrictions on the
                 operation of faulty processes) and certain order
                 specifications (FIFO, causal, atomic). Finally, the
                 relation between consensus and atomic broadcast is
                 investigated: they show that atomic broadcast can be
                 transformed into a consensus algorithm, and that
                 reliable broadcast and consensus yield atomic
                 broadcast (all in the time-free model with
                 crashes). The paper also discusses terminating
                 variants of reliable broadcast (where processes
                 deliver messages consistently even if they weren't
                 sent, e.g., as in Byzantine Agreement
                 \cite{Lamport:1982:BGP}) and multicast
                 specifications. Contains a reference to a
                 ``forthcoming book'' on fundamentals of fault
                 tolerant distributed computing \cite{Hadzilacos:FFT}
                 which obviously has not been published yet. Overall
                 a very rewarding paper suited for introductory
                 courses on this topic.}
}
%%% Huang, Wuu and Tsai: execution models for self-stabilizing systems.
@InProceedings{Huang:1994:DEM,
  author    = {Shing-Tsaan Huang and Lih-Chyau Wuu and Ming-Shin Tsai},
  title     = {Distributed execution model for self-stabilizing
               systems},
  booktitle = {ICDCS94 Proceedings of the 14th International
               Conference on Distributed Computing Systems},
  pages     = {432--439},
  year      = 1994,
  annote    = {The authors introduce four categories of distributed
               system models (serial, synchronous, synchronized and
               distributed) and present a technique that makes
               verification of algorithms in the distributed model
               much easier once they have been proven correct for
               the serial model. [what's the idea behind this?]}
}
%%% Edited volume (Isermann, VDI-Verlag); the German title is kept fully
%%% braced so that styles do not change its capitalisation.
@Book{Isermann:1994:UEF,
  editor    = "Rolf Isermann",
  title     = {{\"Uberwachung und Fehlerdiagnose --- Moderne
               Methoden und ihre Anwendungen bei technischen Systemen}},
  publisher = "VDI-Verlag",
  address   = {{D\"usseldorf}},
  year      = "1994",
  annote    = "[Angabe von Armin]"
}
%%% Jalote's textbook; publisher and address come from the pub-PH macros.
@Book{Jalote:1994:FDS,
  author    = {Pankaj Jalote},
  title     = {Fault tolerance in distributed systems},
  publisher = pub-PH,
  address   = pub-PH:adr,
  year      = 1994,
  annote    = {Fine self-contained overview over the area of fault
               tolerance in distributed systems. However, does not
               mention self-stabilization with a single word.}
}
%%% Kindler's survey on safety and liveness. The url field holds the bare
%%% address, like the other url fields in this file; any \url wrapping
%%% is left to the bibliography style.
@Article{Kindler:1994:SLP,
  author  = {Ekkart Kindler},
  title   = {Safety and Liveness Properties: {A} Survey},
  journal = {EATCS-Bulletin},
  number  = {53},
  month   = jun,
  year    = {1994},
  url     = {http://www.informatik.hu-berlin.de/~kindler/PostScript/EATCS53.ps},
  annote  = {A brief (4 page) and very concise survey on the differences
             and historical evolution of different notions of safety
             and liveness.}
}
%%% Lamport's short communication on structuring long formulas.
@Article{Lamport:1994:HTW,
  author  = {Leslie Lamport},
  title   = {How to Write a Long Formula (Short Communication)},
  journal = {Formal Aspects of Computing},
  volume  = {6},
  number  = {5},
  pages   = {580--584},
  year    = {1994},
  url     = {http://www.research.digital.com/SRC/personal/Leslie_Lamport/proofs/src119.dvi.Z},
  annote  = {Lamport proposes a structured and hierarchical way to write
             long mathematical formulas. Nested parentheses are replaced by
             proper indentation, and formulas with infix operators are used in
             a prefix operator style if they are long. Also, the cases construct
             and the use of definitions is discussed. The only uncertainty is how
             to write implications. This text previously appeared as DEC SRC
             Research Report number 119.}
}
%%% Lamport and Merz at FTRTFT 1994 (LNCS 863). Editor names are left
%%% unbraced so that BibTeX can split them into first/von/last parts.
@InProceedings{Lamport:1994:SVF,
  author    = {Leslie Lamport and Stephan Merz},
  title     = {Specifying and Verifying Fault-Tolerant
               Systems},
  editor    = {H. Langmaack and W.-P. de Roever and J. Vytopil},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant
               Systems},
  series    = ser-LNCS,
  volume    = {863},
  pages     = {41--76},
  publisher = pub-SV,
  address   = {L{\"u}beck, Germany},
  month     = sep,
  year      = {1994},
  OPTorganization = {Third International Symposium Organized Jointly with
               the Working Group Provably Correct Systems-ProCoS},
  annote    = {An in-length exposition of a formal proof of the oral
               messages algorithm to the Byzantine Generals Problem
               \cite{Lamport:1982:BGP}. The problem is specified on three different
               levels of abstraction: (1) a general and high level description of
               the process' behaviors, given that they are loyal, (2) a mid-level
               description containing the algorithm description, and (3) a
               low-level description specifying how message exchange works. Proofs
               are given that each lower level specification implements the next
               higher level specification including the correctness theorem at the
               mid-level: if at most one traitor exists, then the high level
               specification is implemented by the mid level specification. It is
               interesting that the global fault assumption appears at the
               mid-level, which conforms to the fault-tolerant refinement idea
               of \cite{Peled:1994:CFF}. The discussion contains some concrete
               arguments to why TLA and hierarchically structured proofs can help
               engineers prove systems correct up to an acceptable level of
               trust. By introducing real-time, only safety properties need to be
               proved, making aspects of the original Byzantine failure model more
               explicit.}
}
%%% Lamport's main TLA reference (journal name via the j-TOPLAS macro).
@Article{Lamport:1994:TLA,
  author  = {Leslie Lamport},
  title   = {{The Temporal Logic of Actions}},
  journal = j-TOPLAS,
  year    = {1994},
  month   = may,
  volume  = {16},
  number  = {3},
  pages   = {872--923},
  annote  = {Main reference to the syntax, semantics and merits of
             TLA. A good and increasingly exact overview starting from small
             examples, introducing temporal operators, fairness, composition,
             refinement, proof methods and rules, reasons not to use types,
             hiding of variables and some very interesting comments on
             mechanical verification, TLA vs. conventional programming
             languages, and comparisons with related formalisms. For a shorter
             introduction read \cite{Lamport:1994:ITT}.}
}
%%% Lamport's short TLA primer. The acronym is brace-protected so that
%%% sentence-casing styles do not lowercase it.
@TechReport{Lamport:1994:ITT,
  author      = {Leslie Lamport},
  title       = {Introduction to {TLA}},
  institution = {Digital Systems Research Center},
  type        = {Technical Note},
  number      = {1994-001},
  address     = {Palo Alto, CA},
  month       = dec,
  year        = {1994},
  annote      = {A short and instructive primer of TLA omitting all the
                 nitty gritty details. Starting point if you want to
                 specify programs in TLA fast. Standard reference is
                 \cite{Lamport:1994:TLA}.}
}
%%% Line and Ghosh at SRDS 1994; author names are spelled as in
%%% Line:1994:SAD so that both entries sort and abbreviate alike.
@InProceedings{Line:1994:MCS,
  author    = {Jeffery C. Line and Sukumar Ghosh},
  title     = {A methodology for constructing a stabilizing
               crash-tolerant application},
  booktitle = pro-srds94,
  pages     = {12--21},
  year      = {1994},
  annote    = {[to read]}
}
%%% Line and Ghosh at PODC 1994 (booktitle via the pro-podc94 macro).
@InProceedings{Line:1994:SAD,
  author    = {Jeffery C. Line and Sukumar Ghosh},
  title     = {Stabilizing Algorithms for Diagnosing Crash Failures},
  booktitle = pro-podc94,
  year      = {1994},
  month     = aug,
  pages     = {376},
  annote    = {A simple stabilizing ``I am alive'' protocol is
               presented for diagnosing a single crash failure in
               at least strongly connected networks. The protocol
               assumes channels with finite capacities and bounded
               propagation delays. See also \cite{Arora:1995:TBS}.}
}
%%% Liu and Joseph in the FTRTFT 1994 volume (LNCS 863). The booktitle is
%%% capitalised as in Lamport:1994:SVF, which cites the same volume.
%%% The editor list stays disabled (OPT prefix) as in the original.
@InCollection{Liu:1994:SDF,
  author    = "Zhiming Liu and Mathai Joseph",
  title     = "Stepwise Development of Fault-Tolerant Reactive
               Systems",
  booktitle = "Formal Techniques in Real-Time and Fault-Tolerant Systems",
  OPTeditor = "H. Langmaack and W.-P. de Roever and J. Vytopil",
  series    = ser-LNCS,
  number    = "863",
  pages     = "529--546",
  publisher = pub-SV,
  year      = "1994",
  annote    = "[to read]"
}
%%% Lo and Hadzilacos at WDAG 1994 (LNCS 857). The ISBN field holds only
%%% the number; ISBN-aware styles add the "ISBN" label themselves.
@InProceedings{Lo:1994:UFD,
  author    = {Wai-Kau Lo and Vassos Hadzilacos},
  title     = {Using Failure Detectors to Solve Consensus in
               Asynchronous Shared-Memory Systems (Extended
               Abstract)},
  editor    = {Gerard Tel and Paul M. B. Vit{\'a}nyi},
  booktitle = pro-wdag94,
  series    = ser-LNCS,
  volume    = {857},
  pages     = {280--295},
  publisher = pub-SV,
  address   = {Terschelling, The Netherlands},
  month     = "29~" # sep # "--1~" # oct,
  year      = {1994},
  ISBN      = {3-540-58449-8},
  annote    = {[to read]}
}
%%% Lynch, Merritt, Weihl and Fekete: Atomic Transactions. Publisher
%%% name and publisher city are stored in separate fields.
@Book{Lynch:1994:AT,
  author    = "Nancy A. Lynch and Michael Merritt and William Weihl
               and Alan Fekete",
  title     = "Atomic Transactions",
  publisher = "Morgan Kaufmann",
  address   = "San Mateo, CA",
  year      = "1994",
  annote    = "[to read] :-)"
}
%%% Peled and Joseph in Theoretical Computer Science 128. The issue
%%% number follows Schepers:1994:TCP, which is in the same volume on the
%%% directly following pages.
@Article{Peled:1994:CFF,
  author  = {Doron Peled and Mathai Joseph},
  title   = {A Compositional Framework for Fault-tolerance by Specification
             Transformation},
  journal = {Theoretical Computer Science},
  volume  = {128},
  number  = {1-2},
  pages   = {99--125},
  year    = {1994},
  annote  = {A fault-tolerant program is viewed as a fault-intolerant
             program enhanced by some fault-tolerance/recovery mechanism (like
             in \cite{Arora:1998:CDM,Arora:1998:DCT}). This can be viewed as a
             program transformation, i.e. a function $T$ that maps a
             fault-intolerant program $P$ to a fault-tolerant version
             $P'=T(P)$. However, introducing a recovery mechanism alters the
             original specification $S$ of $P$ to some augmented specification
             $S'$ which takes the behavior of the tolerance mechanism into
             account. So the effects of a tolerance mechanism on $S$ can be
             regarded as a specification transformation $F$ which maps $S$ to
             $S'=F(S)$. A program transformation $T$ and a specification
             transformation $F$ correspond, if for all programs $P$ and for all
             properties $p$, if $p$ holds for $P$ then $F(p)$ holds for $T(P)$.
             Now, for some corresponding transformations $T$ and $F$, if some
             property $p$ holds for $P$ and $F(p)$ implies property $q$, then
             $q$ holds for $T(P)$. So properties about $T(P)$ can be proved
             without looking at the code of $T(P)$ if $T$ and $F$ correspond.
             A specification transformation $F$ is said to be compositionally
             complete with respect to a program transformation $T$ if all
             properties can be proved in this way. Criteria for compositional
             completeness are given and depend on the monotonicity and the
             expressiveness of the specification language. --- The methodology
             is exemplified by the example of forward and backward recovery of
             distributed computations. A recovery algorithm is proposed and its
             corresponding specification transformation defined which is
             divided in a fixed part (e.g., eventually a snapshot will be taken
             (liveness) and the tolerance mechanism does not interfere with the
             original computation (safety)) and a part depending on the
             original specification. Then the basic program is transformed into
             a fault-tolerant program $P'$ and then to a fault-tolerant program
             in a faulty environment (the method behind this is described in
             \cite{Liu:1992:TPF}). Then some simple properties of the
             fault-tolerant program are verified by applying the transformation
             $F$ to them and using the above proof rule. These properties are
             usually weaker since faults may deem the original properties
             unachievable (they however do not say how to derive them in
             general). The authors give criteria how to verify that $T$ and $F$
             actually correspond. They also discuss modularity issues:
             variables of the recovery algorithm can be omitted from the
             specification by a method of concealment. Fairness is an open
             problem since imposing a fault-tolerance mechanism and invoking it
             on faults can destroy fairness guarantees that held for the
             untransformed program. The paper uses interleaving semantics with
             a formalism coming from the area of (temporal) logic and
             concurrency. A fine paper.}
}
%%% Ruget at WDAG 1994 (LNCS 857). The ISBN field holds only the number;
%%% ISBN-aware styles add the "ISBN" label themselves.
@InProceedings{Ruget:1994:CMC,
  author    = {Fr{\'e}d{\'e}ric Ruget},
  title     = {Cheaper Matrix Clocks},
  editor    = {Gerard Tel and Paul M. B. Vit{\'a}nyi},
  booktitle = pro-wdag94,
  series    = ser-LNCS,
  volume    = {857},
  pages     = {355--369},
  publisher = pub-SV,
  address   = {Terschelling, The Netherlands},
  month     = "29~" # sep # "--1~" # oct,
  year      = {1994},
  ISBN      = {3-540-58449-8},
  annote    = {[to read]}
}
%%% Rushby's survey and taxonomy of critical system properties; the
%%% annote is a long personal summary with page references.
@Article{Rushby:1994:CSP,
  author  = {John Rushby},
  title   = {Critical System Properties: Survey and Taxonomy},
  journal = {Reliability Engineering and System Safety},
  year    = 1994,
  volume  = 43,
  number  = 2,
  pages   = {189--219},
  annote  = {Although quite long, this is a very insightful and
             rewarding survey of various notions of ``critical systems''
             from the broad literature. First, Rushby compares the four
             distinct approaches to critical systems that have emerged:
             (1) dependability/fault tolerance, (2) safety engineering,
             (3) secure systems, (4) real time systems. The dependability
             approach includes the usual notion of fault tolerance that a
             system should not deviate from its system specification if
             faults occur. The system specification can also be degraded
             resulting in a well-defined failure behavior (or failure
             semantics). The central method to achieve this is application
             of redundancy. Faults are categorized in fault models or
             failure semantics of subcomponents and there is a tradeoff
             between the fault types and the number of faults that can
             occur for a given level of redundancy. (``a quad-redundant
             Byzantine fault-tolerant system can withstand a single fault
             of any kind, whereas a differently organized quad-redundant
             system can withstand as many as three crash faults, but no
             other kind.'' [Hybrid models can help here.]) Managing
             redundancy requires coordination, which is difficult. Methods
             to fight transient faults (self-stabilization) and design
             faults are also discussed. (2) The safety engineering
             approach is concerned with the occurrence of unplanned
             events. Safety means here that the system does no harm of any
             kind. Safety is achieved through hazard analysis (either
             reasoning backwards from a catastrophe or reasoning forward
             from a component failure). This can also be done for
             software, resulting in software fault tree analysis
             (SFTA). The advantage of this approach is that it explicitly
             considers the system context. A ``fail-safe'' operation is
             desired and achieved through a safe step-by-step operation
             based on a notion of locks (``lockin'', ``lockout''). While
             dependability ``tries to maximize the extent to which the
             system works well'', safety engineering ``tries to minimize
             the extent to which it can fail badly'' (p.13f). Thus
             dependability is natural in circumstances in which there is
             ``no safe alternative to normal service'' (like in aircraft
             control). (3) The secure systems approach holds up the
             protection of secrets and privacy. This includes a notion of
             integrity. Methods to achieve this are usually based on
             kernelization. This is analogous to fault containment in
             dependability. (4) The real-time systems approach needs to
             ensure deadlines and ``jitter'' (i.e. a certain quality of
             outputs). Real-time systems are organized as cyclic
             executives of a fixed number of processes in a fixed schedule
             (which has a number of disadvantages described on p. 20) or as
             preemptive and priority driven schemes that dominate today
             (especially a method called rate monotonic scheduling where
             priorities are derived from iteration rates). Both methods
             are compared on p. 23. There are relations especially between
             hard-real-time and masking fault tolerance. In Chapter 3
             Rushby surveys formal models for critical system properties
             and assurance methods. These include formalizations of
             security (via access control mechanisms), fault tolerance and
             real time. Formal notions of properties are usually based on
             traces (although security for example can be seen as a higher
             level property, see p. 29). Fault tolerance formalizations
             are either calculational (like \cite{Arora:1993:CCF}),
             i.e.~they calculate the effects of faults and see whether
             resulting executions are still ``safe'', or specificational,
             i.e.~the fault-tolerance specification is composed of the
             failure semantics of the subsystems (like
             \cite{Herlihy:1991:SGD}). More references to the literature
             are given on p.~30. Formalizations of real-time properties
             are usually based on some form of temporal logic and model
             checking (there are also versions of such logics that take
             time intervals into account, see p.~33f). Assurance
             techniques must take random and systematic failures into
             account to calculate some reliability measure (which for
             critical systems is in the order of $10^{-9}$ probability of
             failure during one hour operation). Direct measurement and
             testing is ruled out because of these high demands (testing
             would require some 100.000 years to meet these
             measures). Calculational approaches on the other hand contain
             many (``only'', p.~37) subjective factors such as the
             examination of the lifecycle process. Formal methods can be
             used to guarantee formal correctness but nobody can give real
             evidence for attaching some reliability number (this is a
             good quote, p.~38). Finally, Rushby provides a taxonomy of
             critical system properties based on interaction and coupling
             which laxly said is the necessity of flexibility versus the
             flexibility offered by the system. Overall this is one of my
             top ten favourite papers because it offers an understandable
             overview with well-chosen and well-explained examples,
             written in fine language and without the usual academic
             high-nose. The pages refer to the printed version from the
             web page http://www.csl.sri.com/reports/html/csl-93-1.html}
}
%%% Sabel and Marzullo: Cornell technical report version; the annote
%%% points to the conference entry Sabel:1994:SFA for the summary.
@TechReport{Sabel:1994:SFS,
  author      = {Laura S. Sabel and Keith Marzullo},
  title       = {Simulating Fail-Stop in Asynchronous Distributed
                 Systems},
  institution = {Cornell University, Computer Science Department},
  number      = {TR94-1413},
  month       = mar,
  year        = {1994},
  pages       = {24},
  abstract    = {The fail-stop failure model appears frequently in the
                 distributed systems literature. However, in an
                 asynchronous distributed system, the fail-stop model
                 cannot be implemented. In particular, it is impossible
                 to reliably detect crash failures in an asynchronous
                 system. In this paper, we show that it is possible to
                 specify and implement a failure model that is
                 indistinguishable from the fail-stop model from the
                 point of view of any process within an asynchronous
                 system. We give necessary conditions for a failure
                 model to be indistinguishable from the fail-stop model,
                 and derive lower bounds on the amount of process
                 replication needed to implement such a failure model.
                 We present a simple one-round protocol for implementing
                 one such failure model, which we call simulated
                 fail-stop.},
  annote      = {Published as \cite{Sabel:1994:SFA} and at
                 PoDC94. Not readily available on the net. See summary of
                 \cite{Sabel:1994:SFA}.}
}
%%% Sabel and Marzullo at SRDS 1994 (booktitle via the pro-srds94 macro);
%%% the annote summarises the simulated fail-stop conditions.
@InProceedings{Sabel:1994:SFA,
  author    = {Laura S. Sabel and Keith Marzullo},
  title     = {Simulating Fail-Stop in Asynchronous Distributed
               Systems},
  booktitle = pro-srds94,
  pages     = {138--147},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, Ca., USA},
  month     = oct,
  year      = {1994},
  annote    = {Abstract in \cite{Sabel:1994:SFS}. The authors
               present a method how to ``implement'' the fail-stop
               failure model in asynchronous environments. Because
               this task is impossible, they give a version of a
               failure model that is indistinguishable from
               fail-stop and call it simulated fail-stop. The
               system model is based on the asynchronous crash
               model with reliable FIFO channels. Processes have a
               local `crash' variable and a `failed' vector which
               should reflect the `crash' values of all other
               processes. They define the failed-before relation in
               terms of these variables: i failed before j in a run
               iff at j failed[i] is true and remains true in that
               run. The indistinguishability of runs bases on the
               definitions of \cite{Chandy:1986:HPL}. The fail-stop
               failure model is defined using two conditions: (FS1)
               A process's failure is eventually detected by all
               processes that don't crash. (FS2) There are no false
               detections. The authors derive three necessary
               conditions for indistinguishability of FS: (C1) If a
               process i detects the crash of a process j, then
               eventually j will crash. (C2) The failed-before
               relation is acyclic. (C3) A crash event happens
               before no other event. These are not sufficient
               conditions, as shown by a run that meets C1--C3 and
               is distinguishable from FS. However, the authors
               give another set of sufficient conditions which are
               not all necessary: weakening FS1 is not possible
               because this may prevent progress, so FS2 is
               weakened into four conditions: (FS2a) If a process i
               suspects the crash of process j, then eventually j
               will crash; in conjunction with FS1 this
               implies C1. (FS2b) The failed-before relation is
               acyclic; this is C2. (FS2c) A process never detects
               its own failure. (FS2d) Once i detects the failure
               of j, then all messages sent by i to any process k will
               not be received until k has also detected the failure
               of j; c and d together imply C3. The authors give a
               simple protocol that implements these
               conditions. The central idea is to form an agreement
               on the suspicions by using intersecting quorum sets
               of processes. This mainly ensures C2. The size of
               such a quorum set must be strictly greater than
               $n(\frac{t-1}{t})$, where n is the number of
               processes and t is the maximum number of processes
               that may fail. The authors relate these results to
               the failure detector hierarchy of
               \cite{Chandra:1996:UFD}: the fail-stop model is
               equivalent to having a perfect failure detector
               (PFD), and the properties that are proposed are
               those of a strong failure detector (SFD). So while a
               PFD cannot be implemented by a SFD, an
               indistinguishable failure detector can be
               implemented. Here's a nice citation: ``A failure
               model describes the manner in which the components
               of a system can fail.'' (Sect. 3)}
}
%%% Schepers and Hooman in Theoretical Computer Science 128(1-2).
@Article{Schepers:1994:TCP,
  author     = {Henk Schepers and Jozef Hooman},
  title      = {A trace-based compositional proof theory for fault
                tolerant distributed systems},
  journal    = {Theoretical Computer Science},
  volume     = {128},
  number     = {1-2},
  pages      = {127--157},
  day        = {6},
  month      = jun,
  year       = {1994},
  corpsource = {Dept. of Math. and Comput. Sci., Eindhoven Univ. of
                Technol., Netherlands},
  keywords   = {alternating bit protocol; compositional formalism;
                distributed processing; exceptional behaviour; failure
                hypothesis; fault tolerant computing; fault tolerant
                distributed systems; formal specification; formal
                verification; input behaviour; network completeness;
                output behaviour; reasoning; safety property
                specification; software reliability; soundness; theorem
                proving; trace-based compositional proof theory; triple
                modular redundant system},
  annote     = {The authors introduce a rigorous formalism allowing to
                prove safety properties of fault tolerant systems. This is done by
                extending a formalism used to reason about normal behavior (such as
                \cite{Hoare:1984:CSP}) with a single rule by which a component
                specification is weakened to reflect its faulty
                behavior. Prerequisite is a precise characterization of faulty
                behavior, which is done using a reflexive relation on normal and
                faulty traces. The method is specificational \cite{Rushby:1994:CSP}
                and at the system interface level describing a specification
                transformation. Examples (stuck at zero, message corruption, message
                loss) are given. Formally, a failure hypothesis is a reflexive
                relation on normal behavior, preserving prefix closure and affecting
                only the components of the failed process. A failure hypothesis can
                be used to derive the faulty behavior of a system. Examples which
                are proved safe are TMR and the alternating bit protocol. The proof
                system is shown to be sound and complete (didn't look at the
                proofs). As said above, only safety properties are
                handled. Compositional reasoning about liveness is difficult
                \cite{Abadi:1993:CS}. Future work states that it would be nice to
                have a logic to express failure hypotheses more
                elegantly. \cite{Schepers:1993:CPT} extends this work to also cover
                real time. Overall an interesting paper, probably the Journal
                version of \cite{Schepers:1993:TFT}, citing all the prominent
                players of the time
                \cite{Joseph:1987:PRF,Liu:1993:SVR,Nordahl:1993:DFD,Peleska:1991:DVF,Weber:1989:FSF}
                and the Conference version of \cite{Peled:1994:CFF}.}
}
%%% Schiper and Sandoz at WDAG 1994 (LNCS 857); author names are spelled
%%% as in Schiper:1994:SSP so that both entries sort and abbreviate alike.
@InProceedings{Schiper:1994:PPV,
  author    = {Andr{\'e} Schiper and Alain Sandoz},
  title     = {Primary Partition ``Virtually-synchronous
               Communication'' Harder than Consensus},
  booktitle = pro-wdag94,
  series    = ser-LNCS,
  number    = {857},
  pages     = {39--52},
  year      = {1994},
  annote    = {The authors formally define the primary partition
               virtually synchronous communication problem (PP-VSC)
               and show that it is harder to solve than consensus
               in the sense that PP-VSC is solvable whenever
               consensus is solvable but there are situations where
               consensus is solvable and PP-VSC is not. PP-VSC
               consists of 6 conditions that formalize the
               following intuition: views are sets of processes. In
               PP-VSC every process has the same view $V$ (as
               opposed to the partial VSC problem). Assume that a
               new view $V'$ has to be defined (because a process
               from $V$ is assumed to have crashed for
               example). Then all processes in both $V$ and $V'$
               must have delivered the same set of messages in view
               $V$ before delivering the new view $V'$. The system
               model used is the asynchronous model enhanced with
               failure suspectors as defined by Chandra and Toueg
               \cite{Chandra:1996:UFD}.}
}
%%% Schiper and Sandoz in Distributed Computing (journal via j-DC macro).
@Article{Schiper:1994:SSP,
  author  = {Andr{\'e} Schiper and Alain Sandoz},
  title   = {Strong Stable Properties in Distributed Systems},
  journal = j-DC,
  volume  = {8},
  number  = {2},
  pages   = {93--103},
  year    = {1994},
  annote  = {[to read]}
}
%%% German signal-processing textbook; the title is fully braced so that
%%% styles do not change its capitalisation.
@Book{Schuessler:1994:DS,
  author    = {H. W. {Sch\"u\ss{}ler}},
  title     = {{Digitale Signalverarbeitung}},
  publisher = pub-SV,
  address   = "Berlin",
  year      = "1994",
  annote    = "[Angabe von Armin]"
}
%%% Schwarz and Mattern: survey on causality detection (journal via the
%%% j-DC macro).
@Article{Schwarz:1994:DCR,
  author  = {Reinhard Schwarz and Friedemann Mattern},
  title   = {Detecting causal relationships in distributed
             computations: in search of the holy grail},
  journal = j-DC,
  volume  = 7,
  pages   = {149--174},
  year    = 1994,
  annote  = {A well written and extensive survey about the
             intrinsic problems in detecting causal relationships
             in distributed systems. First, causality and vector
             time is explained and how both relate to the notion
             of real time. Then implementation aspects of vector
             time are discussed. Next, the authors focus on the
             evaluation of global predicates and show that the
             truth of such a predicate depends on the
             observer. Different modalities of predicates are
             surveyed (including the well known `possibly' and
             `definitely') and present a few algorithms for
             predicate detection. The bibliography section
             contains 74 (!) references, so this paper can be
             used as a starting point for own research. Overall,
             the authors manage to show that dealing with
             distributed systems is a complex and intriguing
             undertaking.}
}
%%% Tel's textbook on distributed algorithms.
@Book{Tel:1994:IDA,
  author    = "Gerard Tel",
  title     = "Introduction to Distributed Algorithms",
  publisher = "Cambridge University Press",
  year      = 1994
}
%%% Walther on automated termination proofs, in the journal
%%% Artificial Intelligence.
%%% NOTE(review): volume given as 71 from the published record; verify
%%% against the printed issue.
@Article{Walther:1994:OPT,
  author  = "Christoph Walther",
  title   = "On Proving the Termination of Algorithms by Machine",
  journal = "Artificial Intelligence",
  volume  = "71",
  pages   = "101--157",
  year    = "1994",
  annote  = {Walther presents a method to prove the termination of a
             class of normal sequential algorithms in a fully automatic
             fashion. The algorithms are formulated in a functional
             programming language and the idea behind this method seems to be
             to derive a well-founded ordering relation on recursive calls by
             some heuristics based on size reduction. His method produces
             hypotheses suitable for proving with an automatic theorem
             prover. The method handles only algorithms that ``strongly''
             terminate (a definition I have not understood) and here not for
             all strongly terminating ones. However, strong termination is a
             practical restriction since all programs that do not have
             recursive calls in the conditions of cases and do not have nested
             recursive calls strongly terminate. The paper contains also an
             overview over older work on (automatic) termination proofs, such
             as a reference to Floyd's idea of termination functions
             \cite{Floyd:1967:AMP}, the first mentioning of the term
             ``convergence function'' \cite{Manna:1974:AAT} and comparison
             work of termination proving methods \cite{Katz:1975:CLT}. }
}
%%% Reader edited by Yang and Marsland (IEEE Computer Society Press).
@Book{Yang:1994:GST,
  editor    = "Zhonghua Yang and T. Anthony Marsland",
  title     = "Global States and Time in Distributed Systems",
  publisher = "IEEE Computer Society Press",
  year      = "1994",
  annote    = {Contains all the classics on the subject, e.g.
               \cite{Chandy:1985:DSD,Cooper:1991:CDG,Garg:1994:DWU,Mattern:1989:VTG}.}
}
%%% Anonymous discussion-session summary from the Dagstuhl volume
%%% (LNCS 938); the author is unknown, hence the placeholder.
@InCollection{??:1995:FLP,
  author    = {???},
  title     = {Summary of the discussion sessions: FLP and real time},
  editor    = {K. P. Birman and F. Mattern and A. Schiper},
  booktitle = {Theory and Practice in Distributed Systems},
  series    = ser-LNCS,
  number    = {938},
  pages     = {260--261},
  publisher = pub-SV,
  year      = {1995},
  annote    = {Summary of a discussion session from the Dagstuhl
               workshop on theory and practice of distributed
               systems. Discusses ways to circumvent the FLP result
               \cite{Fischer:1985:IDC} and the various notions of
               real time in distributed systems. This includes a
               mention of failure detectors, timing assumption
               coverage, real-time scheduling.}
}
@Article{Abadi:1995:CS,
  author   = {Mart{\'\i}n Abadi and Leslie Lamport},
  title    = {Conjoining Specifications},
  journal  = j-TOPLAS,
  year     = {1995},
  month    = may,
  volume   = {17},
  number   = {3},
  pages    = {507--534},
  url      = {http://www.acm.org/pubs/toc/Abstracts/0164-0925/201069.html},
  abstract = {We show how to specify components of concurrent
    systems. The specification of a system is the
    conjunction of its components' specifications.
    Properties of the system are proved by reasoning about
    its components. We consider both the decomposition of a
    given system into parts, and the composition of given
    parts to form a system.},
  annote   = {The basis of this and other research
    \cite{Abadi:1993:CS,Lamport:1989:SAS} is that programs and their
    specifications are formulas in a temporal logic (this idea is
    attributed to Pnueli \cite{Pnueli:1981:TSC}). If specifications
    allow stuttering steps, then $A\Rightarrow B$ asserts that $A$
    implements $B$. So checking the correctness of a program can be
    done within the logic. Parallel composition can then be seen as
    conjunction. When dealing with composite systems there are two
    cases to consider: (1) when starting with a composite
    specification $M$ we want to decompose it into ``subcomponents''
    $M_a$ and $M_b$ where $M_a\land M_b \Rightarrow M$. Decomposition
    usually results in slight modifications (due to communication) of
    $M_a$ and $M_b$ resulting in subcomponents $M_a^l$ and $M_b^l$. We
    want to prove that $M_a^l\land M_b^l\Rightarrow M_a\land M_b$, but
    unfortunately this involves reasoning about the full low level
    protocol. Rather we could make use of the fact that we have a
    decomposition and rather prove $M_a^l\Rightarrow M_a$ and
    $M_b^l\Rightarrow M_b$ to prove our result. But this is not always
    valid. The Decomposition Theorem on page 527 states that we can
    deduce $M_a^l\land M_b^l\Rightarrow M_a\land M_b$ from three
    things: (a) $E_a\land M_a^l\Rightarrow M_a$, (b) $E_b\land
    M_b^l\Rightarrow M_b$, and (c) $M_a\land M_b\Rightarrow E_a\land
    E_b$. (2) The second case to consider is when we start with a set
    of subcomponents and want to reason about the specification of the
    composed system. Given two components as an assumption/guarantee
    specification $E_a\Rightarrow M_a$ and $E_b\Rightarrow M_b$, then
    we would like to deduce that the composed system satisfies
    $M_a\land M_b$ if one is taken as the environment of the
    other. This reasoning is however only valid if $E_a$ and $E_b$ are
    safety properties. This fact is discussed more elaborately in
    \cite{Abadi:1993:CS}. The context in which this reasoning is done
    is TLA \cite{Lamport:1994:TLA}.}
}
@InProceedings{Alvarez:1995:ODA,
  author    = {Guillermo A. Alvarez and Flaviu Cristian and Shivakant
    Mishra},
  title     = {On-Demand Asynchronous Atomic Broadcast},
  booktitle = {Proceedings of the 5th IFIP Working Conference on
    Dependable Computing and Critical Applications},
  year      = {1995},
  address   = {Urbana-Champaign, IL},
  month     = sep,
  url       = {ftp://ftp.cs.ucsd.edu/pub/grad/galvarez/papers/ondemand.ps.Z},
  annote    = {Focusses on practical performance issues. [to read]}
}
@InProceedings{Arora:1995:DMF,
  author    = {Anish Arora and Sandeep S. Kulkarni},
  title     = {Designing masking fault-tolerance via
    nonmasking fault-tolerance},
  booktitle = pro-srds95,
  pages     = {174--185},
  year      = {1995},
  annote    = {Appeared later in the IEEE Transactions on Software
    Engineering \cite{Arora:1998:DMF}.}
}
@InProceedings{Arora:1995:ECC,
  author    = {Anish Arora and Mohamed Gouda},
  title     = {Load balancing: an exercise in constrained convergence},
  booktitle = pro-wdag95,
  year      = {1995},
  pages     = {183--197},
  url       = {ftp://ftp.cis.ohio-state.edu/pub/anish/papers/load-balancing.ps.gz},
  annote    = {Stepwise design of distributed load balancing
    algorithms from specifications using the paradigm of
    constrained convergence.}
}
@InProceedings{Arora:1995:TBS,
  author    = {Anish Arora and David M. Poduska},
  title     = {A timing-based schema for stabilizing information exchange},
  booktitle = {Proceedings of the Third International Conference on
    Computer Networks, Tokyo, Japan},
  year      = {1995},
  annote    = {The authors construct a sequence of increasingly
    complex stabilizing information exchange protocols:
    first a solution for the adjacency problem is given,
    which is extended to a connectivity protocol and
    finally to a general information exchange protocol
    that may be used to detect and establish certain
    predicates in the system. This schema can be used to
    implement commitment, leader election, spanning tree
    construction (i.e., all locally checkable
    specifications). It is closely related to the
    paradigm of information propagation. Interesting is
    the discussion of real time properties: the guarded
    command notation is extended to specify real time
    bounds on actions and methods for specifying and
    proving timeliness properties of algorithms are
    discussed. The underlying system model uses
    synchronized clocks and channels with bounded
    message delay.}
}
@Article{Babaoglu:1995:SVD,
  author   = {{\"O}zalp Babao{\u{g}}lu and Michel Raynal},
  title    = {Specification and Verification of Dynamic Properties
    in Distributed Computations},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = {28},
  number   = {2},
  pages    = {173--185},
  month    = aug,
  year     = {1995},
  keywords = {Boolean algebra; Boolean predicates;
    causality-preserving order; classes; debugging;
    distributed algorithms; distributed applications;
    distributed computations; distributed systems; dynamic
    property specification; dynamic property verification;
    dynamic reconfiguration; formal; global predicate;
    global system states; interval-constrained sequences;
    program; program debugging; program testing; simple
    sequences; specification; verification},
  annote   = {The authors investigate the specification and
    detection of a new class of dynamic properties:
    these are simple sequences (causality preserving
    sequences of global states) and interval-constrained
    sequences (simple sequences with undesired states in
    the middle). They give algorithms that efficiently
    detect these predicates based on the usual
    construction algorithms of the lattice of global
    states \cite{Cooper:1991:CDG}. The paper contains a
    good analysis of the inherent costs of constructing
    the lattice and detecting the predicates and relates
    their (and others') methods to temporal logics. The
    discussion section argues that increased expressive
    power of the observable predicates will always
    result in an increased cost of detecting it,
    however, that the worst case analysis is not very
    realistic since the communication patterns of for
    example programs using RPC result in very lean
    lattices.}
}
@Article{Birman:1995:RTC,
  author  = {Kenneth P. Birman and Bradford B. Glade},
  title   = {Reliability through consistency},
  journal = j-IEEE-SOFTWARE,
  year    = {1995},
  month   = may,
  pages   = {29--41},
  annote  = {This paper argues that consistency is a key to fault
    tolerant applications. In particular, consistent
    failure reporting is important. Different levels of
    consistency are defined (stabilization consistency
    [i.e. the system stabilizes to a consistent state],
    piecewise consistency [i.e. causal consistency], and
    uniform consistency [i.e. atomic
    consistency]). Current systems (such as Unix,
    Chorus, Windows NT, DCE and CORBA, Mach, ISIS and
    others) are assessed for their consistency
    guarantees. Implementation difficulties are discussed.}
}
@Book{Bishop:1995:NNP,
  author    = {Christopher M. Bishop},
  title     = {Neural Networks for Pattern Recognition},
  publisher = {Clarendon Press},
  address   = {Oxford},
  year      = {1995},
  annote    = {[Angabe von Armin]}
}
@TechReport{Chandra:1995:UFD,
  author      = {Tushar Chandra and Sam Toueg},
  title       = {Unreliable Failure Detectors for Reliable Distributed
    Systems},
  institution = {Cornell University, Computer Science Department},
  number      = {TR95-1535},
  month       = aug,
  year        = {1995},
  pages       = {51},
  abstract    = {We introduce the concept of unreliable failure
    detectors and study how they can be used to solve
    Consensus in asynchronous systems with crash failures.
    We characterise unreliable failure detectors in terms
    of two properties --- completeness and accuracy. We
    show that Consensus can be solved even with unreliable
    failure detectors that make an infinite number of
    mistakes, and determine which ones can be used to solve
    Consensus despite any number of crashes, and which ones
    require a majority of correct processes. We prove that
    Consensus and Atomic Broadcast are reducible to each
    other in asynchronous systems with crash failures; thus
    the above results also apply to Atomic Broadcast. A
    companion paper shows that one of the failure detectors
    introduced here is the weakest failure detector for
    solving Consensus [CHT92].}
}
@Article{Charron-Bost:1995:LTP,
  author  = {Bernadette Charron-Bost and Carole Delporte-Gallet
    and Hugues Fauconnier},
  title   = {Local and temporal predicates in distributed systems},
  journal = j-TOPLAS,
  year    = {1995},
  volume  = {17},
  number  = {1},
  pages   = {157--179},
  month   = jan,
  annote  = {This is an intrinsic paper combining general
    knowledge theory and predicate detection in
    distributed systems. The authors re-visit Cooper and
    Marzullo's \cite{Cooper:1991:CDG} predicate
    transformers `possibly' and `definitely',
    investigate their properties and show how they
    relate to the predicate transformer `process p knows
    phi'. They also define the important notion of a
    predicate being local to some process set (i.e. the
    truth value depends only on the local states of that
    set) and show that knowledge is local (i.e. local
    predicates are knowledge predicates and vice
    versa). The results show an interesting analogy
    between knowledge predicates (which are local and
    thus ``spatial'') and the temporal predicates
    `possibly' and `definitely'. Also, a special type of
    predicates (called `observer independent') is
    investigated which are easily detectable: observer
    independent predicates are such for which possibly
    and definitely coincide. They show that a predicate
    which is local to one process is observer
    independent, as well as the disjunction of observer
    independent predicates. Interestingly, these results
    show that ``a process never forgets''. Overall, this
    is a very formal, but nevertheless rewarding article
    offering some surprising insights, but a little
    lengthy missing a few ``real-world'' examples (see
    the article by Halpern and Moses
    \cite{Halpern:1990:KCK} for one with lots of
    examples).}
}
@InProceedings{Chase:1995:EDR,
  author    = {Craig M. Chase and Vijay K. Garg},
  title     = {Efficient Detection of Restricted Classes of Global
    Predicates},
  booktitle = pro-wdag95,
  editor    = {Jean-Michel H{\'e}lary and Michel Raynal},
  series    = ser-LNCS,
  volume    = {972},
  publisher = pub-SV,
  address   = {Le Mont-Saint-Michel, France},
  month     = sep,
  year      = {1995},
  pages     = {303--317},
  ISBN      = {3-540-60274-7},
  annote    = {[to read]}
}
@Article{Cristian:1995:ABF,
  author  = {Flaviu Cristian and Houtan Aghili and Ray Strong and
    Danny Dolev},
  title   = {Atomic Broadcast: From Simple Message Diffusion to
    {Byzantine} Agreement},
  journal = {Information and Computation},
  volume  = {118},
  number  = {1},
  pages   = {158--179},
  month   = apr,
  year    = {1995},
  annote  = {The authors present three timed atomic broadcast
    algorithms with increasing fault tolerance
    properties: (1) timestamped message diffusion based
    on flooding, tolerant against a limited number of
    crash/omission failures; (2) timestamped message
    diffusion with hop count, tolerant against timing
    failures; (3) timestamped message diffusion with hop
    count and authentication, tolerant against
    authentication detectable Byzantine failures. All
    protocols provide timely dissemination up to network
    partition. Two lower bounds prove that (1) the time
    needed for atomic broadcast to terminate in a
    network of diameter $x$ is $O(x)$ (limited number of
    crash/omission failures, network stays connected);
    (2) any atomic broadcast protocol with $n$
    processors that tolerates $n-2$ authentication
    detectable Byzantine processor failures cannot have
    a termination time smaller than
    $(n-1)\cdot\delta$. Conclusions contain references
    to other work on atomic broadcast and show the
    alternative between diffusion based and
    acknowledgement based protocols. The authors also
    argue that bounded reaction time is incompatible
    with partitions. The derivational presentation of
    the algorithms reminds of \cite{Hadzilacos:1994:MAF}
    and is very rewarding.}
}
@Article{Dolev:1995:DFC,
  author  = {Danny Dolev and Joseph Y. Halpern and Barbara Simons and
    Ray Strong},
  title   = {Dynamic Fault-Tolerant Clock Synchronization},
  journal = J-ACM,
  year    = {1995},
  volume  = {42},
  number  = {1},
  pages   = {143--185},
  month   = jan,
  annote  = {Proposes a new algorithm for clock synchronization.
    First gives a good overview over other algorithms: mostly they are
    averaging methods requiring $3f+1$ nodes or $2f+1$ if authentication
    is available. There are also phase locking algorithms, where
    nodes periodically broadcast their time and others set their
    clock to that time. Assumptions are bounded drift rate between
    local hardware clocks, and an upper bound on message transmission
    time. A tolerance specification of linear envelope synchronization
    is given on p. 150. The algorithm they give is later extended to
    also handle processor joins, it can tolerate any number of faults
    provided the correct processes stay connected. Overall a very
    rigorous paper, gives a good impression of clock synchronization
    up to today. }
}
@InProceedings{Dolev:1995:SCS,
  author    = {Shlomi Dolev and Jennifer L. Welch},
  title     = {Self-stabilizing clock synchronization in the presence
    of {Byzantine} faults},
  booktitle = {Proceedings of the Second Workshop on Self-Stabilizing
    Systems},
  year      = {1995},
  pages     = {9.1--9.12},
  annote    = {It is known that clock synchronization in Byzantine
    environments requires $3f+1$ processors if $f$ is
    the number of faulty processors. Protocols exist for
    these cases. In this paper the authors investigate
    the problem under a more severe failure assumption:
    apart from $f$ processors being faulty, any form of
    transient faults may happen to the system. They
    present two probabilistic protocols that synchronize
    clocks in a system under these assumptions. In
    effect, these protocols are self-stabilizing. The
    protocols cause the local clocks to converge into a
    given margin within time exponential to the total
    number of processes. Because they investigate
    arbitrary transient faults, they also use bounded
    clocks that wrap around periodically. They also
    present an interesting application of the Chinese
    Remainder Theorem for implementing a distributed
    counter.}
}
@InCollection{Echtle:1995:TFT,
  author    = {Klaus Echtle and Martin Leu},
  title     = {Test of fault tolerant distributed systems by fault
    injection},
  booktitle = {Fault-Tolerant Parallel and Distributed Systems},
  editor    = {D. Pradhan and D. Avresky},
  publisher = pub-IEEE,
  year      = {1995},
  pages     = {244--251},
  annote    = {This paper presents a method to efficiently produce
    test cases for fault injection to test fault
    tolerant algorithms for design faults. Usually, the
    number of test cases is very large, because every
    branch and all paths of a program must be
    explored. Here, the number of test cases is reduced
    by two methods: (1) instead of analyzing the full
    program, an abstraction of it is considered. The
    abstraction is modeled by a timed Petri net and
    omits the description of nodes assumed to be
    faulty. (2) Test cases are generated from this Petri
    net by constructing the reachability graph and
    semiautomatically cutting off paths that are
    semantically unjustified (because for example timing
    assumptions violate the given failure model). The
    resulting test cases are in a sense ``complete'' and
    significantly less than brute force approaches
    yield. It is interesting how the behavior of faulty
    nodes is modeled on the abstraction level: nothing
    can be assumed about their behavior, resembling
    Byzantine behavior. The test cases can subsequently
    be used to test the implemented system and spare the
    developer from tedious full verification of the
    algorithm with all its low-level details. The work
    in this paper is related to ground-breaking work of
    Echtle in 1984 \cite{Echtle:1984:FSV}.}
}
@InProceedings{Fetzer:1995:PCA,
  author    = {Christof Fetzer and Flaviu Cristian},
  title     = {On the possibility of consensus in asynchronous systems},
  booktitle = {Proceedings of the 1995 Pacific Rim International
    Symposium on Fault-Tolerant Systems},
  year      = {1995},
  month     = dec,
  annote    = {The authors show that consensus is possible in the
    timed asynchronous system model together with the
    ``always eventually majority-stable progress
    assumption''. They argue that this adequately reflects
    today's networked workstations, i.e., that today's
    networks are not completely asynchronous. The work
    is related to other work that adds synchrony to the
    time free model, claims to be closest to
    \cite{Dwork:1988:CPP} (the ``global stabilization
    model'') and does not relate in depth to
    \cite{Chandra:1992:WFD} or \cite{Chandra:1991:UFD}
    because ``the model considered there is time-free,
    [it] assumes that properties of failure detectors
    eventually always hold, and [it] does not include
    processor restarts.''}
}
@InProceedings{Guerraoui:1995:NBA,
  author    = {Rachid Guerraoui and Mikel Larrea and Andr{\'e} Schiper},
  title     = {Non blocking atomic commitment with an unreliable
    failure detector},
  booktitle = pro-srds95,
  year      = {1995},
  month     = sep,
  annote    = {The authors present a solution to the non-blocking atomic
    commitment problem in asynchronous systems using
    failure detectors. A commit protocol is a consensus
    protocol with favour of abort, and it is
    non-blocking meaning that only all surviving members
    need to commit. The authors adapt Chandra and
    Toueg's consensus algorithm \cite{Chandra:1996:UFD}
    to solve atomic commitment. Necessary prerequisites
    for termination are therefore eventually weak
    failure detectors and a majority of correct processes.}
}
@InProceedings{Guerraoui:1995:RRB,
  author    = {Rachid Guerraoui},
  title     = {Revisiting the Relationship Between Non-Blocking
    Atomic Commitment and Consensus},
  booktitle = pro-wdag95,
  editor    = {Jean-Michel H{\'e}lary and Michel Raynal},
  series    = ser-LNCS,
  volume    = {972},
  publisher = pub-SV,
  address   = {Le Mont-Saint-Michel, France},
  month     = "13--15~" # sep,
  year      = {1995},
  pages     = {87--100},
  ISBN      = {3-540-60274-7},
  annote    = {[to read]}
}
@InCollection{Guerraoui:1995:TMV,
  author    = {Rachid Guerraoui and Andr{\'e} Schiper},
  title     = {Transaction model vs virtual synchrony model:
    bridging the gap},
  booktitle = {Theory and Practice in Distributed Systems},
  editor    = {K. P. Birman and F. Mattern and A. Schiper},
  series    = ser-LNCS,
  number    = {938},
  publisher = pub-SV,
  year      = {1995},
  pages     = {121--131},
  annote    = {[to read]}
}
@InProceedings{Gouda:1995:TTS,
  author    = {Mohamed G. Gouda},
  title     = {The triumph and tribulation of system
    stabilization},
  booktitle = pro-wdag95,
  pages     = {1--18},
  year      = {1995},
  annote    = {reviews 10 years of stabilization research.}
}
@InCollection{Halpern:1995:RAK,
  author    = {Joseph Y. Halpern},
  title     = {Reasoning about Knowledge: {A} Survey},
  booktitle = {Handbook of Logic in Artificial Intelligence and Logic
    Programming, Volume 4: Epistemic and Temporal
    Reasoning},
  editor    = {D. M. Gabbay and C. J. Hogger and J. A. Robinson},
  publisher = {Oxford University Press},
  year      = {1995},
  pages     = {1--34},
  annote    = {[to read]}
}
@InProceedings{Isermann:1995:OFL,
  title     = {On Fuzzy Logic Applications for Automatic Control,
    Supervision and Fault Diagnosis},
  author    = {Rolf Isermann},
  booktitle = {Proceedings of the Third European Congress on Intelligent
    Techniques and Soft Computing (EU-FIT)},
  address   = {Aachen},
  year      = 1995,
  pages     = {738--753},
  annote    = {[Angabe von Armin]}
}
@InProceedings{Jegou:1995:LSA,
  author    = {Roland J{\'e}gou and Raoul Medina and Lhouari Nourine},
  title     = {Linear space algorithm for on-line detection of global
    predicates},
  booktitle = {Proceedings of the International Workshop on Structures
    in Concurrency Theory (STRICT)},
  editor    = {J{\"o}rg Desel},
  series    = {Workshops in Computing},
  publisher = pub-SV,
  year      = {1995},
  pages     = {175--189}
}
@InProceedings{Julier:1995:NAF,
  title     = {A new approach for filtering nonlinear systems},
  author    = {Simon J. Julier and Jeffrey K. Uhlmann and Hugh F.
    Durrant-Whyte},
  booktitle = {Proceedings of the 1995 American Control Conference},
  address   = {Seattle, WA},
  year      = 1995,
  pages     = {1628--1632},
  annote    = {Presents a method to replace the extended Kalman filter
    \cite{Welch:1995:IKF} by some tricky prediction mechanism
    that doesn't require to calculate Jacobian matrices.}
}
@Misc{Ladkin:1995:340,
  author       = {Peter Ladkin},
  title        = {Re: A340 incident at {Heathrow} {(Hatton, RISKS-16.92)}},
  howpublished = {The Risks Digest (Forum on Risks to the Public in Computers
    and Related Systems)},
  month        = mar,
  volume       = 16,
  number       = 96,
  year         = {1995},
  url          = {http://catless.ncl.ac.uk/Risks},
  annote       = {Describes in detail the Airbus A340 incident at Heathrow
    in September 1994. During the approach, both display screens
    in the cockpit went blank and displayed a message ``please
    wait''. The pilots were still able to fly the plane, but
    it's somewhat difficult without instrument feedback. The
    autopilot, which was subsequently switched on, tuned into
    a ``false glideslope'', a side effect of the radio beam
    used for landing the aircraft under instrument conditions.
    This caused the aircraft to fly unusually high pitch rates.
    The pilots subsequently turned off the autopilot and
    used a SRA (surveillance radar approach) where the plane
    is ``talked'' down by the tower. They landed safely. Later
    the logs of the computer system showed that there had been
    near-to simultaneous faults in the two redundant flight
    control systems leading to unexpected behavior (for
    example, the system also wrongly complained that it
    was low on fuel). Airbus Industries is said to be
    aware that there are problems within the redundancy
    management and that the failure of one computer can cause
    a failure in the next.}
}
@Article{Lamport:1995:HWP,
  author  = {Leslie Lamport},
  title   = {How to write a proof},
  journal = {American Mathematical Monthly},
  year    = {1995},
  month   = aug # "\slash " # sep,
  volume  = {102},
  number  = {7},
  pages   = {600--608},
  url     = {http://www.research.digital.com/SRC/personal/Leslie_Lamport/proofs/proofs.html},
  annote  = {A way of writing proofs is presented that ``makes it much
    harder to prove things that are not true''. It is a structured
    proof writing method similar to proof trees of interactive theorem
    proving environments. The exposition and experience reports with
    this method are delightful. Prior version appeared as DEC SRC
    Research Report number 94}
}
@InProceedings{LeLann:1995:ORN,
  author       = {G{\'e}rard {Le Lann}},
  title        = {On real-time and non real-time distributed computing},
  booktitle    = pro-wdag95,
  year         = {1995},
  month        = sep,
  pages        = {51--70},
  OPTnumber    = {972},
  OPTseries    = ser-LNCS,
  OPTpublisher = pub-SV,
  OPTnote      = {Invited paper.},
  annote       = {The author explores the relationship between
    problems in real-time and non real-time
    computing. The distinction between both classes is
    that real-time problems have a set of timeliness
    constraints and their model has additional
    restrictions on event releases. Timeliness
    constraints are considered to be a composition of
    safety and liveness properties. Two examples are
    discussed: the asynchronous consensus problem (for
    non real-time) and the hard real-time distributed
    multiaccess channel problem. Both presentations,
    especially that of the second example, are intricate
    and tedious to understand. Finally, the author
    identifies that timeliness constraints are related
    to on-line scheduling strategies in the sense that
    solutions in an asynchronous model may be
    ``immersed'' into real-time environments by adding
    special scheduling algorithms. This corresponds to a
    distinction between design and implementation
    phases. The paper contains a reference to the
    distinct phases of diffusion and decision in
    asynchronous consensus and relates them to knowledge
    theoretic terms such as partial common knowledge. It
    also discusses real-time equivalents of an
    eventually weak failure detector. Overall, a paper
    with lots of ideas, many typos and typographical
    shortcomings (obviously hastily produced) and lots
    of passages which I do not grasp.}
}
@InCollection{Liu:1995:FFF,
  title     = {A formal framework for fault-tolerant programs},
  author    = {Zhiming Liu and Mathai Joseph},
  editor    = {C. M. Mitchell and V. Stavridou},
  booktitle = {Mathematics of Dependable Computing},
  publisher = {Oxford University Press},
  year      = 1995,
  pages     = {131--148},
  annote    = {This is a book chapter incorporating ideas and results
    from a few other papers by the same authors. The beginning is the
    same as in \cite{Liu:1996:VFR}: program development is described as
    a sequence of refinement steps; action systems, TLA, specifications
    and the rest of the formalism is introduced. In contrast to
    \cite{Liu:1996:VFR} here also liveness properties are
    studied. Liveness properties result from imposing some fairness
    condition on the specification. Refinement mappings are defined
    along the lines of \cite{Lamport:1989:SAS}. Then faults and their
    effects are studied (as in \cite{Liu:1996:VFR}): physical faults are
    modeled as a set of actions which are scheduled concurrently with
    regular program actions, i.e. faults are isolated/separated from the
    program. Then the fault-tolerant refinement relation is discussed
    (like in \cite{Liu:1996:VFR}) and the distinction between global and
    local fault assumption (terms are attributed to Nordahl's thesis
    \cite{Nordahl:1992:SDD}). Global fault assumptions are always safety
    properties while local fault assumptions are safety and liveness
    properties (specified by state transitions). It is shown (as in
    \cite{Liu:1996:VFR}) that the global fault assumptions may be
    integrated into the fault actions: the specification of the
    fault-affected program is the conjunction of an (1) initial
    property, (2) the state transitions of the program and the faults,
    (3) the fault assumption and (4) the fairness property. The safety
    properties (2) and (3) can be encoded in a new state transition
    relation and thus are ``locally programmable''. Yes, separating
    local from global fault assumptions makes it easier to specify fault
    affected behaviors. But before proving the fault tolerance, the
    global assumption should be integrated into the transition
    system. Overall this is a version of \cite{Liu:1996:VFR} using the
    same examples but discussing liveness issues and not touching
    real-time. }
}
@Book{Manna:1995:TVR,
  author    = {Zohar Manna and Amir Pnueli},
  title     = {Temporal Verification of Reactive Systems: Safety},
  publisher = pub-SV,
  year      = {1995},
  annote    = {See also \cite{Manna:1991:TLR}. Where's liveness?}
}
@InCollection{Mattern:1995:NLI,
  author    = {Friedemann Mattern and Stefan F{\"u}nfrocken},
  title     = {A non-blocking lightweight implementation of causal
    order message delivery},
  booktitle = {Theory and Practice in Distributed Systems},
  editor    = {K. P. Birman and F. Mattern and A. Schiper},
  series    = ser-LNCS,
  number    = {938},
  publisher = pub-SV,
  year      = {1995},
  pages     = {197--213},
  annote    = {An implementation of causal order delivery using low
    level FIFO buffers. Excludes some computations but
    is very efficient.}
}
@Book{Neumann:1995:CRR,
  author    = {Peter G. Neumann},
  title     = {Computer-Related Risks},
  publisher = {ACM Press},
  year      = {1995},
  annote    = {A great collection of computer related incidents from many
    areas (defense, space, aviation, etc.) affecting
    reliability, safety and security together with cause
    analysis and a discussion about technical and social
    countermeasures. A good source of information and also
    a starting point for more because of the good references
    (especially the RISKS forum).}
}
@TechReport{Sabel:1995:EVC,
  author      = {Laura S. Sabel and Keith Marzullo},
  title       = {Election Vs. Consensus in Asynchronous Systems},
  institution = {Cornell University, Computer Science Department},
  number      = {TR95-1488},
  month       = feb,
  year        = {1995},
  pages       = {9},
  abstract    = {It was shown in 1985 that the {\em Consensus problem}
    cannot be solved in an asynchronous system if even a
    single crash failure can occur. In this paper, we show
    that there are other problems that cannot be solved in
    an asynchronous system, and for the same intuitive
    reason: it is impossible to distinguish a very slow
    processor from a crashed processor. However, these
    problems are harder than Consensus, in that there are
    contexts in which Consensus can be solved but these
    other problems cannot. More precisely, the weakest
    failure detector that is needed to solve these problems
    is a Perfect Failure Detector, which is strictly
    stronger than the weakest failure detector that is
    needed to solve Consensus. We use a formulation of the
    Election problem as the prototype for these problems
    that are harder than Consensus.},
  annote      = {Contains a good and concise definition of failure
    detectors \`a la Chandra and Toueg
    \cite{Chandra:1996:UFD} in terms of temporal
    logic. The proof idea is as follows: a failure
    detector has very weak completeness iff eventually
    every process that crashes is suspected at least
    once by some correct process. The authors then show
    that (1) strong accuracy and very weak completeness are
    necessary to solve election, and (2) that both
    together are sufficient. This shows that a strongly
    accurate and very weakly complete failure detector
    is the weakest failure detector necessary for
    election. Very weak completeness and strong accuracy
    however suffice to implement a perfect failure
    detector. Thus the weakest failure detector for
    election is stronger than the weakest failure
    detector for consensus. Thus, election is harder
    than consensus. Other problems as hard as election
    are primary backup and (probably) terminating
    reliable broadcast. }
}
@Article{Singhal:1995:OPA,
  author       = {Mukesh Singhal and Friedemann Mattern},
  title        = {An optimality proof for asynchronous recovery
                  algorithms in distributed systems},
  journal      = j-IPL,
  volume       = {55},
  number       = {3},
  pages        = {117--121},
  year         = {1995},
  month        = aug,
  day          = {11},
  keywords     = {Algorithms; asynchronous recovery; Asynchronous
                  recovery algorithms; Computation theory; Computer
                  networks; Computer simulation; Computer system
                  recovery; Consistent cut; consistent cut; Data
                  communication systems; Data processing; Distributed
                  computer systems; distributed processing; distributed
                  systems; Internal events; Message receive events;
                  Message send events; Optimality proof; optimality
                  proof; roll backs; system recovery},
  treatment    = {T Theoretical or Mathematical},
  annote       = {[to read]}
}
@InProceedings{Stoller:1995:FPD,
  author       = {Scott D. Stoller and Fred B. Schneider},
  title        = {Faster Possibility Detection by Combining Two
                  Approaches},
  booktitle    = pro-wdag95,
  OPTeditor    = {Jean-Michel H{\'e}lary and Michel Raynal},
  OPTseries    = ser-LNCS,
  OPTvolume    = {972},
  OPTpublisher = pub-SV,
  OPTISBN      = {3-540-60274-7},
  pages        = {318--332},
  month        = sep,
  year         = {1995},
  annote       = {The main contribution of this paper is the best
                  in-depth investigation of the complexity of
                  possibility detection so far. The general algorithms
                  by Cooper and Marzullo \cite{Cooper:1991:CDG} have
                  worst case time complexity of $\Omega(S^N)$ where
                  $N$ is the number of processes and $S$ is the
                  maximum number of relevant events on every
                  process. This is because \emph{every} consistent
                  global state has to be investigated. However, as
                  shown for example by Garg and Waldecker
                  \cite{Garg:1994:DWU}, one can do better for
                  restricted types of predicates. In this paper, the
                  authors show an interesting decomposition property
                  of the set of global consistent states and an
                  application to possibility detection: a state $g$ is
                  globally consistent iff for any subset $F$ of
                  processes (1) the restriction of $g$ to $F$ is
                  concurrent to the restriction of $g$ to the
                  complement of $F$, and (2) the restriction of $g$ to
                  $F$ is a consistent global state in the computation
                  restricted to $F$, and (3) the restriction of $g$ to
                  the complement of $F$ is a consistent global state
                  in the computation restricted to the complement of
                  $F$. The idea now is to reformulate the detection
                  predicate and to specialize it with respect to some
                  subset $F$ of processes. Then, possibly(P) is
                  equivalent to choosing a set $F$ of processes,
                  choosing a consistent global state $g$ of the
                  computation restricted to $F$, and checking whether
                  possibly(P') holds in the computation restricted to
                  the complement of $F$, where P' denotes the
                  predicate P specialized to $g$. (Uff!) Now, having
                  such a fixed set $F$, a standard algorithm for
                  possibility detection can be run in ``smaller''
                  computations, but this has to be done as many times
                  as the computation restricted to $F$ has consistent
                  global states. So, the complexity of the resulting
                  algorithm depends on $|F|$ and is $O(S^{|F|+1})$
                  which is better than usual whenever
                  $|F|<N-1$. However, finding a minimal fixed set is
                  shown to be NP-complete and so only approximations
                  help in general (there is some work to be done
                  here). The authors additionally show that among all
                  formulas equivalent to $P$, the disjunctive normal
                  form (DNF) has minimum cost for possibility detection
                  (every disjunct can be detected separately). A few
                  enhancements are discussed, example applications are
                  given and some funny matrix multiplication method is
                  presented for off-line possibility
                  detection. Finally, a well-written section discusses
                  the inherent complexity of detecting possibly and
                  gives some good references. Overall, this is a paper
                  that at some points supersedes my own abstraction
                  bounds and swims in a theoretical ocean which is
                  very wide.}
}
@Article{Verissimo:1995:QSS,
  author       = {Paulo Ver{\'\i}ssimo and Carlos Almeida},
  title        = {Quasi-synchronism: a step away from the traditional
                  fault-tolerant real-time system models},
  journal      = {Bulletin of the Technical Committee on Operating Systems
                  and Application Environments (TCOS)},
  volume       = {7},
  number       = {4},
  pages        = {35--39},
  year         = {1995},
  annote       = {The ideas herein appear in a more general and elaborate
                  form in \cite{Almeida:1998:QSA}.}
}
@TechReport{Welch:1995:IKF,
  author       = {Greg Welch and Gary Bishop},
  title        = {An Introduction to the {Kalman} filter},
  institution  = {University of North Carolina at Chapel Hill,
                  Department of Computer Science},
  number       = {TR 95-041},
  address      = {Chapel Hill, NC 27599-3175},
  year         = {1995},
  annote       = {This paper provides an introduction to the concept
                  of a Kalman filter for the non-expert. A Kalman filter can be used
                  to estimate the state of a discrete linear process in noisy
                  environments. If the process is non-linear, an extended Kalman
                  filter is used that assumes linearity on intervals of process
                  behavior. The extended Kalman filter requires calculating the
                  Jacobian matrix of derivatives of the process modeling function.
                  A new approach to filtering nonlinear systems that does not
                  require calculating these matrices is described in
                  \cite{Julier:1995:NAF}.}
}
@Article{Wiederhold:1995:MIS,
  author       = {Gio Wiederhold},
  title        = {Mediation in Information Systems},
  journal      = j-ACM-COMP-SURVEYS,
  year         = {1995},
  month        = jun,
  volume       = {27},
  number       = {2},
  pages        = {265--267},
  url          = {http://www.acm.org/pubs/toc/Abstracts/0360-0300/210390.html},
  annote       = {discusses sensor/actuator approach [to read]}
}
@InProceedings{Zhou:1995:FNP,
  author       = {Jianying Zhou and Dieter Gollmann},
  title        = {A Fair Non-repudiation Protocol},
  booktitle    = {Proceedings of the IEEE Symposium on Security and Privacy},
  series       = {Research in Security and Privacy},
  pages        = {55--61},
  organization = {IEEE Computer Society, Technical Committee on Security
                  and Privacy},
  publisher    = pub-IEEE,
  address      = {Oakland, CA},
  month        = may,
  year         = {1996},
  keywords     = {non-repudiation, trusted third party},
  annote       = {something like active exchange \cite{Buerk:1990:VES}, [to get]}
}
@InProceedings{Aguilera:1996:RFD,
  author       = {Marcos Kawazoe Aguilera and Sam Toueg},
  title        = {Randomization and Failure Detection: {A} Hybrid
                  Approach to Solve Consensus},
  booktitle    = pro-wdag96,
  editor       = {{\"O}zalp Babaoglu and Keith Marzullo},
  series       = {Lecture Notes in Computer Science},
  volume       = {1151},
  pages        = {29--39},
  publisher    = pub-SV,
  address      = {Bologna, Italy},
  month        = "9--11~" # oct,
  year         = {1996},
  ISBN         = {3-540-61769-8},
  annote       = {[to read]}
}
@InProceedings{Almeida:1996:TFD,
  author       = {Carlos Almeida and Paulo Ver{\'\i}ssimo},
  title        = {Timing Failure Detection and Real-Time Group
                  Communication in Quasi-Synchronous Systems},
  booktitle    = {Proceedings of the 8th Euromicro Workshop on
                  Real-Time Systems},
  address      = {L'Aquila, Italy},
  month        = jun,
  year         = {1996},
  annote       = {A newer and more elaborate treatment of this topic
                  can be found in \cite{Almeida:1998:QSA}.}
}
@Article{Arora:1996:CSB,
  author       = {Anish Arora and Mohamed G. Gouda and George Varghese},
  title        = {Constraint satisfaction as a basis for designing
                  nonmasking fault-tolerant systems},
  journal      = {Journal of High Speed Networks},
  volume       = {5},
  number       = {3},
  pages        = {293--306},
  year         = {1996},
  OPTcrossref  = {Arora:1994:CSB},
  OPTnote      = {A preliminary version appeared at ICDCS94.},
  annote       = {Probably the same as \cite{Arora:1994:CSB} but more
                  citeable.}
}
@InProceedings{Ayache:1996:FMV,
  author       = {S. Ayache and E. Conquet and P. Humbert and C.
                  Rodriguez and J. Sifakis and R. Gerlich},
  title        = {Formal Methods for the Validation of Fault Tolerance
                  in Autonomous Spacecraft},
  booktitle    = pro-ftcs96,
  pages        = {353--359},
  publisher    = {IEEE},
  address      = {Washington},
  month        = "25--27~" # jun,
  year         = {1996},
  ISBN         = {0-8186-7261-7},
  annote       = {[to read]}
}
@Article{Babaoglu:1996:UFS,
  author       = {{\"O}zalp Babao{\u{g}}lu and Eddy Fromentin and
                  Michel Raynal},
  title        = {A unified framework for the specification and
                  run-time detection of dynamic properties in
                  distributed computations},
  journal      = {Journal of Systems and Software},
  volume       = {33},
  pages        = {287--298},
  year         = {1996},
  annote       = {The authors present a general framework with which
                  to detect a large class of properties of distributed
                  computations. Abstractly, property detection can be
                  seen as searching through an evolving directed
                  acyclic graph (DAG) with labelled nodes. If the
                  nodes carry labels according to specific properties,
                  the detection problem can be formulated as an
                  instance of the language recognition problem. This
                  counts for all properties that are expressible as
                  regular languages. The framework can be used to
                  detect properties of computations based on sequences
                  of local states (control flows). It can also be used
                  to detect properties defined on sequences of
                  consistent global states. Thus, the method is a
                  generalization of the property detection approaches
                  of Cooper and Marzullo \cite{Cooper:1991:CDG} for
                  possibly and definitely
                  \cite{Babaoglu:1993:CGS,Schwarz:1994:DCR}. The
                  detection methods are based on mapping an accepting
                  automaton onto the nodes. For properties of control
                  flow it suffices to add an array of bits (of the
                  order of the set of states of the accepting
                  automaton) to every node and message and have a
                  distributed controller running and updating the
                  array at every node. If the larger class of
                  properties on sequences of global states is to be
                  detected, the authors employ a central monitoring
                  process (like in \cite{Cooper:1991:CDG}) that
                  incrementally constructs the lattice of consistent
                  states. The nodes of the lattice are an array of
                  bits (one for every state of the accepting
                  automaton). While the previous approach is
                  practically feasible and implemented (in the EREBUS
                  distributed debugger process mentioned in the
                  article), the detection of sequences of global
                  states seems to be infeasible. However, properties
                  on single global states (like possibly and
                  definitely) do not need the expressibility of
                  regular languages and so detecting these sorts of
                  predicates can be feasible in practice. The authors
                  see their method as a sort of on-the-fly model
                  checking that have no idea of the model they are
                  checking against. Overall, I like this paper very
                  much: it is concise and mathematically sound, uses a
                  minimal set of examples and strives for theoretical
                  excellence.}
}
@Book{Barbosa:1996:IDA,
  author       = {Valmir C. Barbosa},
  title        = {An Introduction to Distributed Algorithms},
  year         = {1996},
  publisher    = {MIT Press},
  address      = {Cambridge, MA},
  keywords     = {book, text, parallel processing, supercomputers,
                  computer algorithms,},
}
@TechReport{Basu:1996:SPP,
  author       = {Anindya Basu and Bernadette Charron-Bost and Sam
                  Toueg},
  title        = {Solving Problems in the Presence of Process Crashes
                  and Lossy Links},
  institution  = {Cornell University, Computer Science Department},
  number       = {TR96-1609},
  pages        = {30},
  month        = sep,
  year         = {1996},
  abstract     = {We study the effect of link failures on the
                  solvability of problems in asynchronous systems that
                  are subject to process crashes: given a problem that
                  can be solved in a system with process crashes and
                  reliable links, is the problem solvable even if links
                  are lossy? We answer this question for two types of
                  lossy links, and show that the answer depends on the
                  maximum number of processes that may crash and the
                  nature of the problem to be solved. In particular, we
                  prove that the answer is positive if fewer than half of
                  the processes may crash or if the problem specification
                  does not refer to the state of processes that crash.
                  However, in general, the answer is negative even if
                  each link can lose only a finite number of
                  messages.},
  annote       = {A shorter version appeared at WDAG-10
                  \cite{Basu:1996:SRL} which is summarized there and
                  not in this bibliographic entry.}
}
@InProceedings{Basu:1996:SRL,
  author       = {Anindya Basu and Bernadette Charron-Bost and Sam Toueg},
  title        = {Simulating reliable links with unreliable links in
                  the presence of process crashes},
  booktitle    = pro-wdag96,
  pages        = {105--122},
  publisher    = pub-SV,
  address      = {Bologna, Italy},
  month        = oct,
  year         = {1996},
  annote       = {The authors investigate the question, what problems
                  that are solvable with reliable links and possible
                  process crashes remain solvable in the presence of
                  unreliable links. They investigate two types of
                  unreliable links: eventually reliable (there is a
                  time after which the link becomes reliable = finite
                  message loss), and fair lossy (if an infinite number
                  of messages is sent over a channel, then an infinite
                  number of messages is received at the other end =
                  infinite message loss). Intuitively, a reliable link
                  is also eventually reliable, and an eventually
                  reliable link is also fair lossy. The authors show
                  two things: (1) there are problems (e.g. uniform
                  reliable broadcast) that are solvable with reliable
                  channels but are not solvable with eventually reliable
                  channels. This means that, in general, eventually
                  reliable links cannot simulate reliable links. (2)
                  if the majority of processes in the system is
                  correct, then fair lossy links can simulate reliable
                  links. The key idea behind this fact is that
                  processes must infinitely often diffuse their
                  message histories. This is however very inefficient
                  (requires unbounded storage capacity in nodes and
                  unbounded message length). In general, this is a
                  paper which reveals again the importance of
                  correct-restricted problems (problems in which only
                  correct processes are required to do something),
                  because correct-restricted problems remain solvable
                  even with fair lossy links.}
}
@InProceedings{Beauquier:1996:MFH,
  author       = {Joffroy Beauquier and Synn{\"o}ve Kekkonen},
  title        = {Making {FTSS} is hard},
  booktitle    = {Proceedings of the International Conference on Software
                  Engineering (ICSE'96)},
  pages        = {91--96},
  address      = {Las Vegas, USA},
  month        = jul,
  year         = {1996},
  annote       = {Possibly similar to brief announcement
                  \cite{Beauquier:1997:OFS}. See also
                  \cite{Beauquier:1997:FTS,Kekkonen:1998:RFA}.}
}
@InProceedings{Camp:1996:AAT,
  author       = {Jean Camp and Michael Harkavy and J. D. Tygar and
                  Bennet Yee},
  title        = {Anonymous atomic transactions},
  booktitle    = {Proceedings of the 2nd USENIX Workshop on Electronic
                  Commerce},
  pages        = {123--133},
  month        = nov,
  year         = {1996},
  annote       = {[to write]}
}
@InProceedings{Chandra:1996:IGM,
  author       = {Tushar Deepak Chandra and Vassos Hadzilacos and Sam
                  Toueg and Bernadette Charron-Bost},
  title        = {On the Impossibility of Group Membership},
  booktitle    = pro-podc96,
  pages        = {322--330},
  publisher    = {ACM},
  address      = {New York, USA},
  month        = may,
  year         = {1996},
  ISBN         = {0-89791-800-2},
  OPTnote      = {Also published as Technical Report TR95-1548,
                  Cornell University.},
  annote       = {The authors show that the problem of weak group
                  membership (WGM) is impossible in the asynchronous
                  system model used by Fischer et al. in their famous
                  impossibility proof of consensus
                  \cite{Fischer:1985:IDC}. WGM is defined having two
                  properties: (liveness) if processes want to leave the
                  group, at least one other process must install a new
                  view of the group and no process installs a view
                  different from it; (safety) it must be possible that the
                  new view installed is correct. Impossibility of WGM
                  is especially noteworthy because it is at the core
                  of many group communication systems (e.g. Isis and
                  Transis). That's what makes this paper worthwhile
                  citing.}
}
@Article{Chandra:1996:UFD,
  author       = {Tushar Deepak Chandra and Sam Toueg},
  title        = {Unreliable failure detectors for reliable
                  distributed systems},
  journal      = j-ACM,
  volume       = {43},
  number       = {2},
  pages        = {225--267},
  month        = mar,
  year         = {1996},
  annote       = {Journal version of \cite{Chandra:1991:UFD}.}
}
@Article{Chandra:1996:WFD,
  author       = {Tushar Deepak Chandra and Vassos Hadzilacos and Sam
                  Toueg},
  title        = {The weakest failure detector for solving consensus},
  journal      = j-ACM,
  volume       = {43},
  number       = {4},
  pages        = {685--722},
  month        = jul,
  year         = {1996},
  annote       = {Journal version of \cite{Chandra:1992:WFD}.}
}
@InProceedings{Charpentier:1996:ACR,
  author       = {Michel Charpentier and Mamoun Filali and Philippe
                  Mauran and G{\'e}rard Padiou and Philippe Qu{\'e}innec},
  title        = {Abstracting communication to reason about
                  distributed algorithms},
  booktitle    = pro-wdag96,
  pages        = {89--103},
  year         = {1996},
  annote       = {Concurrent programming formalisms like UNITY are
                  often based on locally shared variables as
                  communication primitives. While it is possible to
                  extend these models via definitions with message
                  passing primitives this is quite cumbersome. In this
                  paper the authors propose an abstraction of
                  communication based on observations: variable $x$ observes
                  $y$ if updates of $x$ reflect all the updates of $y$
                  but not necessarily in a timely manner, i.e.,
                  eventually $x$ will take on all values of $y$ in the
                  original order. Using this observation relation on
                  variables, the authors present inference rules for
                  the UNITY framework that can be used to prove that
                  distributed algorithms have certain properties. The
                  observation relation is interesting because it
                  abstracts from communication and a communication
                  topology and thus acts like a transport layer of
                  communication subsystems. No relations to knowledge
                  based protocol formalisms are discussed, although
                  they seem to have fundamental similarities.}
}
@Article{Charron-Bost:1996:SAC,
  author       = {Bernadette Charron-Bost and Friedemann Mattern and
                  Gerard Tel},
  title        = {Synchronous, asynchronous, and causally ordered
                  communication},
  journal      = j-DC,
  volume       = {9},
  pages        = {173--191},
  year         = {1996}
}
@InProceedings{Cristian:1996:GMS,
  author       = {Flaviu Cristian},
  title        = {Group, Majority, and Strict Agreement in Timed
                  Asynchronous Distributed Systems},
  booktitle    = {Proceedings of the Twenty-Sixth International
                  Symposium on Fault-Tolerant Computing},
  pages        = {178--189},
  publisher    = {IEEE},
  address      = {Washington},
  month        = "25--27~" # jun,
  year         = {1996},
  ISBN         = {0-8186-7261-7},
  annote       = {[to read]}
}
@Article{Cristian:1996:SAG,
  author       = {Flaviu Cristian},
  title        = {Synchronous and Asynchronous Group Communication},
  journal      = j-CACM,
  year         = {1996},
  month        = apr,
  volume       = {39},
  number       = {4},
  pages        = {88--97},
  subject      = {{\bf H.5.3}: Information Systems, INFORMATION
                  INTERFACES AND PRESENTATION, Group and Organization
                  Interfaces, Asynchronous interaction. {\bf H.5.3}:
                  Information Systems, INFORMATION INTERFACES AND
                  PRESENTATION, Group and Organization Interfaces,
                  Synchronous interaction.},
  annote       = {[to read]},
}
@InProceedings{Dega:1996:RMA,
  author       = {Jean-Louis Dega},
  title        = {The redundancy mechanisms of the {Ariane} 5
                  operational control center},
  booktitle    = pro-ftcs96,
  pages        = {382--386},
  year         = {1996},
  annote       = {Dega reports on details of the ground control system
                  of the Ariane 5 project. The system is a fully
                  distributed real-time system built on top of an
                  off-the-shelf real-time operating system. It
                  controls the countdown procedure until 3 seconds
                  before the launch. The main system components are
                  duplicated twice (using hot/warm standby) and can be
                  repaired on-line. The main dependability requirement
                  is to be fail-safe, i.e. in case of critical
                  failures the system should stop in a safe state. The
                  probability of a serious event occurring during was
                  aimed to be $10^{-6}$. Main failure detection
                  functions are performed by the components themselves
                  (self-checking). The time constraints to hand over
                  from active to standby machine are less than 300
                  ms. The design of the system is another example for
                  safety being more important than liveness in
                  practical systems (see \cite{Kreitz:1998:SWL}). See
                  also the notes on redundancy in a report on current
                  NASA work \cite{Marcopulos:1998:FBC}.}
}
@TechReport{Dolev:1996:FDO,
  author       = {Danny Dolev and Roy Friedman and Idit Keidar and
                  Dahlia Malkhi},
  title        = {Failure detectors in omission failure environments},
  institution  = {Cornell University, Computer Science Department},
  number       = {TR96-1608},
  month        = sep,
  year         = {1996},
  annote       = {[to read] studies also partitions, surveyed in
                  \cite{Aguilera:1998:FDC}. Published as a brief announcement
                  at PoDC 97 \cite{Dolev:1997:FDO}.}
}
@Misc{ESA:1996:A5F,
  author       = {{European Space Agency}},
  title        = {ARIANE 5 Flight 501 Failure},
  howpublished = {\url{http://www.esrin.esa.it/htdocs/tidc/Press/Press96/ariane5rep.html}},
  note         = {Report by the Inquiry Board},
  year         = {1996},
  month        = jul
}
@InProceedings{Fetzer:1996:FAT,
  author       = {Christof Fetzer and Flaviu Cristian},
  title        = {Fail-Awareness in Timed Asynchronous Systems},
  booktitle    = pro-podc96,
  pages        = {314--321a},
  address      = {Philadelphia},
  month        = may,
  year         = {1996},
  note         = {\url{http://www-cse.ucsd.edu/users/cfetzer/FA/fa.html}},
  annote       = {Shows how to transform a synchronous specification $S$
                  into a weakened specification $F$ which is implementable in
                  timed-asynchronous systems. A synchronous specification is one
                  which prescribes a real-time deadline for completion of the
                  service. See \cite{Cristian:1999:TAD} for an explanation of
                  the timed asynchronous system model.}
}
@InProceedings{Fetzer:1996:FFD,
  author       = {Christof Fetzer and Flaviu Cristian},
  title        = {Fail-Aware Failure Detectors},
  booktitle    = {Proceedings of the 15th Symposium on Reliable
                  Distributed Systems ({SRDS} 1996)},
  pages        = {200--209},
  publisher    = {IEEE Computer Society Press},
  address      = {Los Alamitos, Ca., USA},
  month        = oct,
  year         = {1996},
  ISBN         = {0-8186-7481-4},
  annote       = {The authors report on the contradiction that
                  election has been proved requiring a perfect failure
                  detector \cite{Sabel:1995:EVC} but election seems
                  implementable in existing asynchronous systems
                  \cite{Fetzer:1995:PCA}. To resolve this
                  contradiction, they introduce a new class of
                  fail-aware failure detectors which together with
                  certain progress assumptions \cite{Fetzer:1995:PCA}
                  are sufficient to solve election and are
                  implementable in timed asynchronous systems
                  \cite{Cristian:1998:TAS}. Fail-aware failure detectors
                  are based on the idea that a process suspects itself
                  immediately if it is suspected by another process
                  (strong fail-awareness) or by a majority of other
                  processes (weak fail awareness). Together with the
                  strong completeness and eventual weak accuracy of
                  Chandra and Toueg \cite{Chandra:1996:UFD} both
                  attributes are sufficient to solve election in
                  asynchronous systems. The failure detectors have
                  infinite output domains and thus resemble very much
                  those of \cite{Aguilera:1997:HTF}. The
                  fail-awareness property can be implemented in timed
                  asynchronous systems only when making progress
                  assumptions \cite{Fetzer:1995:PCA} which assume
                  strict synchrony for a ``sufficiently long'' period
                  of time. Also, these are the first failure detectors
                  that actually reference real time in their
                  specifications, which is a little confusing when
                  designing algorithms for the time-free
                  model. However, exact formal definitions in terms of
                  \cite{Chandra:1996:UFD} are given but the full
                  consequences of the definition in terms of whether
                  ``new'' and previously undiscovered features are
                  added are not discussed in depth.}
}
@Book{Gabriel:1996:POS,
  author       = {Richard P. Gabriel},
  title        = {Patterns of {Software}. {Tales} from the {Software}
                  {Community}},
  publisher    = {Oxford University Press},
  address      = {New York, Oxford},
  year         = {1996},
  annote       = {This is a striking, sometimes shocking and sometimes
                  amusing collection of essays by a man who (as one of
                  the developers of Lisp) has closely been related to
                  the emergence of high level programming languages
                  and the entire software engineering discipline for
                  about 20 to 30 years. Gabriel not only gives
                  inspiring insights into the benefits of small
                  systems ({\`a} la Siefkes), good documentation ({\`a}
                  la Knuth's literate programming) and what makes a
                  programming language good vs. widely accepted, but
                  also tells instructive tales about the rise and fall
                  of his own company, Lucid, during the 90s. A well
                  readable book written in almost spoken language and
                  with sometimes a little ``diffusing'' sequences of
                  ideas, but with a lot of perfectly arguable points,
                  which makes this book a good starting point for
                  discussions on software engineering (along with
                  Brooks' ``No Silver Bullet'' \cite{Brooks:1987:NSB}).}
}
@Article{Garg:1996:DSU,
  author       = {V. K. Garg and Brian Waldecker},
  title        = {Detection of strong unstable predicates in
                  distributed programs},
  journal      = {IEEE Transactions on Parallel and Distributed Systems},
  volume       = {7},
  number       = {12},
  pages        = {1323--1333},
  year         = {1996},
  annote       = {Angaben aus \cite{Stoller:1997:DGP}.}
}
@Book{Garg:1996:PDS,
  author       = {Vijay K. Garg},
  title        = {Principles of Distributed Systems},
  year         = {1996},
  publisher    = {Kluwer Academic Publishers},
  address      = {Boston, MA}
}
@InProceedings{Guerraoui:1996:CS,
  author       = {R. Guerraoui and A. Schiper},
  title        = {Consensus Service: {A} Modular Approach for Building
                  Agreement Protocols in Distributed Systems},
  booktitle    = pro-ftcs96,
  pages        = {168--177},
  publisher    = {IEEE},
  address      = {Washington},
  month        = "25--27~" # jun,
  year         = {1996},
  ISBN         = {0-8186-7261-7},
  annote       = {[to read]}
}
@InProceedings{Guerraoui:1996:GAF,
  author       = {Rachid Guerraoui and Andr{\'e} Schiper},
  title        = {``{Gamma}-Accurate'' Failure Detectors},
  booktitle    = {Distributed Algorithms, 10th International Workshop,
                  {WDAG} '96},
  editor       = {{\"O}zalp Babaoglu and Keith Marzullo},
  series       = {Lecture Notes in Computer Science},
  volume       = {1151},
  pages        = {269--286},
  publisher    = pub-SV,
  address      = {Bologna, Italy},
  month        = "9--11~" # oct,
  year         = {1996},
  ISBN         = {3-540-61769-8},
  annote       = {[to read]}
}
@PhdThesis{Hoefling:1996:MFP,
  author       = {T. H{\"o}fling},
  title        = {{Methoden zur Fehlererkennung mit Parametersch\"atzung und
                  Parit\"atsgleichungen}},
  school       = {Technische Hochschule Darmstadt},
  year         = {1996},
  OPTnote      = {{erschienen als Fortschr. Ber. VDL, VDI-Verlag,
                  D\"usseldorf}}
}
@InProceedings{Hurfin:1996:ODC,
  author       = {Michel Hurfin and M. Mizuno and Michel
                  Raynal and M. Singhal},
  title        = {On-the-Fly Detection of Conjunctions of Local
                  Predicates in Distributed Computations},
  booktitle    = {Eighth {IEEE} Symposium on Parallel and Distributed
                  Processing ({SPDP}'96)},
  pages        = {589--592},
  publisher    = {IEEE Computer Society},
  address      = {Washington},
  year         = {1996},
  month        = oct,
  ISBN         = {0-8186-7683-3},
  annote       = {[to get]}
}
@Article{Hutter:1996:VSE,
  author       = {Dieter Hutter and Bruno Langenstein and Claus Sengler
                  and J{\"o}rg H. Siekmann and Werner Stephan and
                  Andreas Wolpers},
  title        = {Verification Support Environment {(VSE)}},
  journal      = {High Integrity Systems},
  volume       = {1},
  number       = {6},
  pages        = {523--530},
  year         = {1996},
  annote       = {Gives an overview of VSE-I. Good reference. For VSE-II,
                  better cite \cite{Hutter:1998:VSE}.}
}
@Article{Isermann:1996:MUE,
  author       = {Rolf Isermann},
  title        = {{Modellgest\"utzte \"Uberwachung und Fehlerdiagnose
                  technischer Systeme}},
  journal      = {Automatisierungstechnische Praxis},
  volume       = {38},
  pages        = {9--20, 48--57},
  year         = {1996},
  annote       = {[Angabe von Armin]}
}
@PhdThesis{Janowski:1996:BAF,
  author       = {T. Janowski},
  title        = {Bisimulation and Fault-Tolerance},
  type         = {Thesis},
  school       = {Department of Computer Science, University of Warwick},
  address      = {Coventry, UK},
  number       = {CS-THESIS-JANOWSKI96},
  month        = feb,
  year         = {1996},
  url          = {http://www.dcs.warwick.ac.uk/pub/reports/theses/jan96.html},
  note         = {Also University of Warwick Department of Computer
                  Science Research Report CS-RR-300},
  abstract     = {In the area of concurrent, communicating systems, a
                  common approach to verify the absence of design faults is in
                  terms of an equivalence relation between a high-level and a
                  low-level process. One such relation is bisimulation and this
                  holds if two processes cannot be distinguished by observing them
                  for a finite interval of time. However, the absence of design
                  faults does not guarantee that the process will behave correctly
                  in practice as it depends on various hardware devices which may
                  be subject to physical faults themselves. Such faults cannot be
                  avoided; they must be tolerated. The purpose of this thesis is
                  to provide a formal framework, based on bisimulations and using
                  the Calculus of Communicating Systems, by which we can specify,
                  design and verify concurrent, fault-tolerant systems, with
                  emphasis placed on reasoning and design under weak assumptions
                  about faults.},
  annote       = {[to get, requested from Warwick]}
}
@InProceedings{Liu:1996:VFR,
  author       = {Z. Liu and M. Joseph},
  title        = {Verification of Fault Tolerance and Real Time},
  booktitle    = pro-ftcs96,
  pages        = {220--229},
  publisher    = {IEEE},
  address      = {Sendai, Japan},
  month        = jun,
  year         = {1996},
  ISBN         = {0-8186-7261-7},
  annote       = {Programs and specifications are viewed as formulas in the
                  same logic (originally an idea of \cite{Pnueli:1981:TSC} explained
                  in \cite{Abadi:1995:CS,Abadi:1993:CS,Lamport:1989:SAS}). The logic
                  used here is TLA \cite{Lamport:1994:TLA} and the programming
                  notation are action systems (i.e. simple automata). Both formalisms
                  are related and it is shown how to transform action systems into
                  TLA. Here, only TLA formulas are studied which are safety
                  properties, i.e. only safety properties of action systems are
                  discussed. The development of a program $P$ from a specification
                  $S$ can be viewed as a sequence of refinement steps $P<P_n<\ldots
                  P_1=S$ starting with $S$ and ending with $P$ where in each step the
                  lower level version of the program is shown to implement the higher
                  level version (this is done using some refinement calculus,
                  e.g. \cite{Abadi:1991:ERM}). A program $P$ which implements $S$ in
                  fault-free operations may not do so in the presence of physical
                  faults. Faults are modeled as a set $f$ of fault operations on the
                  system state and the effect of faults is viewed as a transformation
                  $F(P,f)$ which is an interleaved execution of $P$ and $f$. The
                  transformed program is called the ``$f$-affected'' version of
                  $P$. If the $f$-affected version of $P$ satisfies some property $q$
                  and $q$ is the specification of some program $P'$ then $P$ is the
                  $f$-tolerant refinement of $P'$, denoted $P<_f P'$. The relation
                  $<_f$ is stronger than the ordinary refinement relation $<$ and
                  generally is not reflexive (why?). But it is somewhat transitive: if
                  $P_1<_{f_1}P_2$ and $P_2<_{f_2}P_3$ then $P_1<_{f_1}P_3$ ! Apart
                  from an actions set $f$ a fault model requires a behavioral
                  specification called ``behavioral fault assumption''. This is
                  analogous to the rely specification of
                  \cite{Voelzer:1998:VFT}. Generally, this is a safety property (as
                  conjectured in \cite{Gaertner:1999:ESD}) so it can be
                  ``implemented'' within $f$. Separation of fault actions and
                  behavioral fault assumption usually makes specification
                  easier. Proving the fault tolerance properties of some program
                  results in proving that a program is a fault tolerant refinement of
                  another. Real time is basically handled by adding a clock and
                  formulating real time properties as safety properties. Section 5
                  discusses related work: \cite{Liu:1992:TPF} presents methods how to
                  obtain fault-tolerant refinements of programs, other work is
                  \cite{Liu:1994:SDF}. It is noted that these methods can be used to
                  prove fault-tolerant algorithms using PVS. Transformational
                  approaches are independent of formalism (\cite{Nordahl:1993:DFD}
                  uses CSP, \cite{Janowski:1996:BAF} uses CCS).}
}
@Book{Lynch:1996:DA,
author = {Nancy Lynch},
title = {Distributed Algorithms},
publisher = {Morgan Kaufmann},
address = {San Mateo, CA},
year = {1996}
}
@INPROCEEDINGS{Mizuno:1996:TBT,
AUTHOR = "Masaaki Mizuno and Hirotsugu Kakugawa",
TITLE = "A timestamp based transformation of self-stabilizing
programs for distributed computing environments",
BOOKTITLE = pro-wdag96,
YEAR = 1996,
PAGES = "304--321",
annote = "In the serial model, an atomic execution step
consists of a read-sub-step, where processes read
the state of their neighbours, followed by a local
state change. Each process can always see the states
of one of its neighbours and only one process at a
time executes an atomic step. In the distributed
model, an atomic execution step is either a
read-sub-step or a local state change based on its
own state and the locally recorded neighbours'
states. In this paper the authors present a method
to transform an algorithm from the serial model to
an algorithm from the distributed model and show
that the self-stabilization property is preserved
during transformation. The idea of the scheme is to
simulate the serial model by imposing a
transaction-commit protocol on every execution step
of the original algorithm. As an execution step
corresponds to a transaction, a lot of theorems from
serializability theory may be applied. The criterion
to prove serializability of the transformed program
is based on timestamps from Lamport logical
clocks. Correctness and message complexity depend
on the usual prerequisites of reaching consensus and
the careful choice of timeout values."
}
@inproceedings{Owre:1996:PVS,
TITLE = {{PVS}: Combining Specification, Proof Checking, and
Model Checking},
AUTHOR = {S. Owre and S. Rajan and J. M. Rushby and N. Shankar
and M. K. Srivas},
BOOKTITLE = {Computer-Aided Verification, CAV '96},
EDITOR = {Rajeev Alur and Thomas A. Henzinger},
PAGES = {411--414},
PUBLISHER = pub-SV,
SERIES = {Lecture Notes in Computer Science},
NUMBER = 1102,
MONTH = jul # "/" # aug,
YEAR = 1996,
ADDRESS = {New Brunswick, NJ}
}
@Book{Spies:1996:FSS,
editor = {Katharina Spies and Manfred Broy and Stephan Merz},
title = {Formal Systems Specification: The RPC-Memory Specification
Case Study},
publisher = pub-SV,
year = {1996},
number = {1169},
series = ser-LNCS,
month = dec,
annote = {A collection of papers from a Dagstuhl seminar 9439 in
1994 where a sample problem is specified and verified
in many different formalisms. There's also a Dagstuhl
report with abstracts on this.}
}
@book{Tanenbaum:1996:CN,
  author    = {Andrew S. Tanenbaum},
  title     = {Computer Networks},
  edition   = {Third},
  publisher = pub-PH,
  address   = pub-PH:adr,
  year      = {1996},
  annote    = {The well-known bestseller.}
}
@InProceedings{Vogels:1996:WWF,
author = {Werner Vogels},
title = {World Wide Failures},
booktitle = {Proceedings of the ACM SIGOPS European Workshop},
year = {1996},
address = {Connemara, Ireland},
month = sep,
annote = {Discusses practical concerns in building reliable failure
detectors. Explicitly references \cite{Chandra:1996:UFD}
and presents some timeout measurements. It is said that
a full paper with detailed results will appear soon.
Has it already? A good complement to
\cite{Sergent:1999:FDI}.}
}
@InProceedings{Zhou:1996:FNP,
author = "Jianying Zhou and Dieter Gollmann",
title = "A Fair Non-repudiation Protocol",
keywords = "non-repudiation, trusted third party",
pages = "55--61",
booktitle = "Proceedings of the IEEE Symposium on Research in Security
and Privacy",
address = "Oakland, CA",
year = "1996",
publisher = pub-IEEE,
month = may,
organization = "IEEE Computer Society, Technical Committee on Security
and Privacy",
annote = "[to get]"
}
@InProceedings{Afek:1997:LS,
author = "Yehuda Afek and Shlomi Dolev",
title = "Local Stabilizer",
booktitle = pro-podc97,
pages = "287--?",
year = "1997",
annote = "The authors present a protocol module which can be
imposed onto arbitrary round based algorithms and
turn them into self-stabilizing algorithms. This is
done much in the spirit of Katz and Perry
\cite{Katz:1993:SEM} by using a detection protocol
and a repair protocol. The detection protocol part
sends the complete state of a node to all its
neighbours in every round. After $d$ rounds, and by
forwarding states from neighbours, every node will
be able to construct a pyramid of local snapshots of
all nodes within diameter $d$ of itself. Level $k$
of the pyramid reflects the state of $k$-distant
node before $k$ rounds. Not only the state is
forwarded, but also the inputs to the node before
that round, so a remote node can check what the node
in question was supposed to be doing and can detect
inconsistencies immediately. On detecting an
inconsistency, the repair mechanism freezes the
outer network and diffuses the ``right'' state to
all processes within the infected portion of the
network. In case this is not possible (because the
majority of nodes has been perturbed for example), a
reset procedure is invoked. This paper contains some
very clever ideas that have a lot of potential for
optimization. The pyramid of states however and
checking the consistency implies that every node
does all the computations of all other
nodes. Together with the round based model this
implements an omniscient observer at every node that
takes snapshots at the beginning of every
round. Because of the round synchronization, these
snapshots must be identical at every node. Thus
inconsistencies can be detected. Stabilization time
is fast, but a huge amount of space is needed."
}
@InProceedings{Aguilera:1997:HTF,
author = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
title = "Heartbeat: a timeout-free failure detector for
quiescent reliable communication",
number = "1320",
series = ser-LNCS,
pages = "126--140",
booktitle = pro-WDAG97,
year = "1997",
publisher = pub-SV,
month = sep,
annote = "The authors consider the problem of reliable
communication within quiescent algorithms,
i.e. algorithms that eventually stop sending
messages, in asynchronous systems with lossy
links. They solve the problem using a novel failure
detector called `heartbeat'. This failure detector
is a vector of size $n$ within each node, where $n$
is the number of neighbours the node has (one entry
per neighbour). The value of slot $i$ increments if
an alive signal (message) has been received by
neighbour $i$. It is shown that reliable
communication can be achieved in such settings using
heartbeat but it seems that the problems of timeouts
and synchrony are moved one level downwards. The
authors argue that this is okay since the failure
detector may be shared by other system modules and
there is no `terminating' version of failure
detectors anyway. The authors claim that heartbeat
is implementable and give evidence in which they use
the term ``periodically'' instead of
``timeout''. The main novelty with heartbeat is that
it has an infinite range, i.e. it outputs infinite
values (in contrast to previous versions that output
finite lists of suspects). Apart from this last
point, this paper is a good
starting point for finding literature on failure
detection."
}
@techreport{Aguilera:1997:QRC,
  author      = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title       = {Quiescent Reliable Communication and Quiescent
                 Consensus in Partitionable Networks},
  institution = {Cornell University, Computer Science Department},
  number      = {TR97-1632},
  pages       = {24},
  month       = jun,
  year        = {1997},
  abstract    = {We consider partitionable networks with process
                 crashes and lossy links, and focus on the problems of
                 reliable communication and consensus for such networks.
                 For both problems we seek algorithms that are
                 quiescent, i.e., algorithms that eventually stop
                 sending messages. We first tackle the problem of
                 reliable communication for partitionable networks by
                 extending the results of [ACT97a]. In particular, we
                 generalize the specification of the heartbeat failure
                 detector HB, show how to implement it, and show how to
                 use it to achieve quiescent reliable communication. We
                 then turn our attention to the problem of consensus for
                 partitionable networks. We first show that, even though
                 this problem can be solved using a natural extension of
                 <>S, such solutions are not quiescent --- in other
                 words, <>S alone is not sufficient to achieve quiescent
                 consensus in partitionable networks. We then solve this
                 problem using <>S and the quiescent reliable
                 communication primitives that we developed in the first
                 part of the paper. Our model of failure detectors for
                 partitionable networks, a natural extension of the
                 model in [CT96], is also a contribution of this
                 paper.},
  annote      = {See the Journal version \cite{Aguilera:1999:UHF}.}
}
@techreport{Aguilera:1997:WFD,
  author      = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title       = {On the Weakest Failure Detector for Quiescent Reliable
                 Communication},
  institution = {Cornell University, Computer Science Department},
  number      = {TR97-1640},
  pages       = {16},
  month       = jul,
  year        = {1997},
  abstract    = {We consider the problem of achieving reliable
                 communication with quiescent algorithms (i.e.,
                 algorithms that eventually stop sending messages) in
                 asynchronous systems with process crashes and lossy
                 links, and show that, among failure detectors with
                 bounded output size, <>P is the weakest one that can be
                 used to solve this problem. Combined with a result in
                 [ACT97a], this shows that failure detectors that are
                 commonly used in practice, i.e., those that output
                 lists of suspects, are not always the best ones to
                 solve a problem.},
  annote      = {[to read]}
}
@article{Alur:1997:TAA,
  author  = {Rajeev Alur and Hagit Attiya and Gadi Taubenfeld},
  title   = {Time-Adaptive Algorithms for Synchronization},
  journal = {SIAM Journal on Computing},
  volume  = {26},
  number  = {2},
  pages   = {539--556},
  month   = apr,
  year    = {1997},
  annote  = {Proves that time is insignificant to safety properties.
             Referenced in \cite{Merritt:1998:FSO}. Looks at consensus
             and mutual exclusion in shared memory environments that have
             an unknown upper bound on memory access times.}
}
@misc{Arora:1997:OCC,
  author       = {Anish Arora and Mohamed G. Gouda},
  title        = {On the correctness criteria of load balancing programs},
  howpublished = {Internet: ftp://ftp.cis.ohio-state.edu/pub/anish/papers/load-balancing.ps.gz},
  month        = apr,
  year         = {1997},
  note         = {},
  annote       = {revised version of \cite{Arora:1995:ECC}, submitted to IEEE TPDS.}
}
@InProceedings{Asokan:1997:OPF,
author = "N. Asokan and M. Schunter and M. Waidner",
title = "Optimistic Protocols for Fair Exchange",
pages = "8--17",
booktitle = "4th {ACM} Conference on Computer and Communications
Security",
address = "Z{\"u}rich, Switzerland",
year = "1997",
publisher = pub-ACM,
month = apr,
editor = "Tsutomu Matsumoto",
annote = "[to get]"
}
@article{Avizienis:1997:TSD,
  author  = {Algirdas Avizienis},
  title   = {Toward Systematic Design of Fault-Tolerant Systems},
  journal = j-IEEE-COMPUTER,
  volume  = {30},
  number  = {4},
  pages   = {51--58},
  month   = apr,
  year    = {1997},
  annote  = {[to read]}
}
@techreport{Babaoglu:1997:PGM,
  author      = {{\"O}zalp {Babao\u{g}lu} and Renzo Davoli and Albert
                 Montresor},
  title       = {Partitionable group membership: specification and
                 algorithms},
  institution = {Department of Computer Science, University of
                 Bologna, Italy},
  number      = {UBLCS-97-1},
  month       = jan,
  year        = {1997},
  note        = {Revised May 1997.},
  OPTannote   = {[to read]}
}
@inproceedings{Beauquier:1997:OFS,
  author    = {Joffroy Beauquier and Synn{\"o}ve Kekkonen-Moneta},
  title     = {On {FTSS}-Solvable Distributed Problems},
  booktitle = {Proceedings of the Sixteenth Annual {ACM} Symposium on
               Principles of Distributed Computing},
  address   = {Santa Barbara, California},
  pages     = {290},
  month     = {21--24~} # aug,
  year      = {1997},
  annote    = {Brief announcement at PODC, 1 page only. See also
               \cite{Beauquier:1996:MFH,Beauquier:1997:FTS,Kekkonen:1998:RFA}.}
}
@Article{Beauquier:1997:FTS,
author = {Joffroy Beauquier and Synn{\"o}ve Kekkonen-Moneta},
title = {Fault-tolerance and self-stabilization: impossibility
results and solutions using self-stabilizing failure
detectors},
journal = {International Journal of Systems Science},
year = {1997},
volume = {28},
number = {11},
pages = {1177--1187},
annote = {A rounding-up paper of previous work in fault-tolerance
and self-stabilization started with \cite{Gopal:1993:USF} and
\cite{Anagnostou:1993:TTP}. The authors show that the
transformation of a fault-tolerant protocol into a fault-tolerant
self-stabilizing (ftss) protocol (performed in synchronous
environments in \cite{Gopal:1993:USF}) cannot be extended to
asynchronous environments because it is impossible to distinguish
a slow process from a crashed one. Then, they show that the
impossibility result of \cite{Anagnostou:1993:TTP} (which also
rests on the necessity to distinguish a crashed from a slow
process) can be extended to a class of networks. These results can
be circumvented by adding some synchrony to the model in the form
of failure detectors (in the spirit of \cite{Chandra:1996:UFD}).
The synchrony assumption here is called ``fair communication'',
meaning that a correct process can receive only finitely many
messages from any one correct neighbor before receiving a message
from every other correct neighbor. (Processes are assumed to emit
a message to every neighbor at every tick of their local clock.)
This seems to be equivalent to a combination of stabilizing clock
drift and stabilizing transmission delay. The authors give
implementations for failure detectors based on this assumption
for both cases whether or not the bound is known or only the time
until it holds is unknown or not (in the spirit of partial
synchrony \cite{Dwork:1988:CPP}). The ideas herein are exposed
more elaborately in Kekkonen-Moneta's thesis
\cite{Kekkonen:1998:RFA}.}
}
@phdthesis{Borcherding:1997:AEB,
  author = {Malte Borcherding},
  title  = {{Authentifikationsvoraussetzungen f\"ur effiziente
            byzantinische \"Ubereinstimmung}},
  school = {Universit\"at Karlsruhe, Fakult\"at f\"ur Informatik},
  year   = {1997},
  note   = {Logos-Verlag, Berlin},
  annote = {Stellt mehrere Zwischengrade der Authentifikation vor,
            die effizientere Agreement-Algorithmen ermoeglichen.
            Implizit Definition einer deutschen Terminologie fuer
            Uebereinstimmungsprobleme.}
}
@article{Chen:1997:FRC,
  author  = {Biao Chen and Sanjay Kamat and Wei Zhao},
  title   = {Fault-Tolerant, Real-Time Communication in
             {FDDI}-Based Networks},
  journal = j-IEEE-COMPUTER,
  volume  = {30},
  number  = {4},
  pages   = {83--90},
  month   = apr,
  year    = {1997},
  annote  = {[to read]}
}
@InProceedings{Dolev:1997:FDO,
author = {Danny Dolev and Roy Friedman and Idit Keidar and
Dahlia Malkhi},
title = {Failure detectors in omission failure environments},
booktitle = pro-podc97,
OPTpages = {186},
year = {1997},
OPTannote = {This is a brief announcement (1 page) at PODC97. See
also the technical report version \cite{Dolev:1996:FDO}
[to read]}
}
@article{Dolev:1997:PIS,
  author  = {Shlomi Dolev},
  title   = {Possible and impossible self-stabilizing digital clock
             synchronization in general graphs},
  journal = {Journal of Real-Time Systems},
  volume  = {12},
  number  = {1},
  pages   = {95--107},
  year    = {1997},
  annote  = {This paper contains a good general survey of clock
             synchronization in shared-memory multiprocessor
             systems with a general communication graph.}
}
@article{Dolev:1997:SRR,
  author  = {Shlomi Dolev},
  title   = {Self-Stabilizing Routing and Related Protocols},
  journal = {Journal of Parallel and Distributed Computing},
  volume  = {42},
  number  = {2},
  pages   = {122--127},
  year    = {1997},
  annote  = {[to read]}
}
@TechReport{Doudou:1997:MDC,
author = "Assia Doudou and Andr{\'e} Schiper",
title = "Muteness detectors for consensus with {Byzantine}
processes",
institution = "EPFL -- D{\'e}partement d'Informatique, Lausanne,
Switzerland",
year = "1997",
OPTnumber = "TR-97/230",
OPTmonth = oct,
annote = "The authors extend the notion of failure detectors
to the Byzantine failure model. Generally, a
Byzantine process can do four things: (1) ignore
another process, or (2) send garbled messages to
another process, or (3) send messages that seem
correct to another process but do not follow the
protocol, or (4) skip protocol messages. To combat
this type of faulty behavior the authors present a
muteness failure detector. A process i is mute to a
process j if there is a time after which i crashes,
or i stops sending messages to j, or i sends only
incorrectly signed messages or unsigned messages to
j. Based on this definition, the muteness detector
is defined in terms of the traditional eventual weak
accuracy property and mute completeness, stating
that eventually every process i which is mute to
process j is permanently suspected by j. The
muteness detector guards against (1) and (2). The
behaviors (3) and (4) can be detected and corrected
by usual methods to solve Byzantine agreement
\cite{Lamport:1982:BGP}. Channels must be FIFO to be
able to detect missing messages (previous solutions
required causal message delivery
\cite{Malkhi:1997:UID}). The authors adapt the
consensus specification to Byzantine environments
(resulting in a definition of the vector consensus
problem) and give an algorithm that uses the
muteness detector to achieve consensus in a
Byzantine environment. The algorithm is based on the
early consensus algorithm by Schiper
\cite{Schiper:1997:ECA}. There's a mentioning of the
echo broadcast technique for solving the Byzantine
Generals Problem \cite{Lamport:1982:BGP} with signed
messages."
}
@inproceedings{Fetzer:1997:FAA,
  author    = {Christof Fetzer and Flaviu Cristian},
  title     = {Fail-Awareness: An Approach to Construct Fail-Safe
               Applications},
  booktitle = {Proceedings of The Twenty-Seventh Annual International
               Symposium on Fault-Tolerant Computing ({FTCS}'97)},
  pages     = {282--291},
  publisher = {IEEE},
  ISBN      = {0-8186-7831-3},
  month     = jun,
  year      = {1997},
  annote    = {The authors introduce the notion of fail-awareness
               as an approach to construct fail-safe
               applications. Fail-awareness is based on the idea
               that the underlying system is timed asynchronous,
               i.e., it is synchronous with a bound on timeliness
               and failure rate most of the time, and asynchronous
               in special cases. If such asynchronous phases can be
               detected, the affected parts of the system must
               switch into an exception mode that signals this fact
               to clients. In this way, the system may degenerate
               in a safe way. If synchronous performance is
               re-established, services may re-join and catch up
               again. Fail-awareness can be used to transform
               synchronous service specifications so that they
               become implementable in timed asynchronous
               systems. The detection of timeliness properties is
               based on synchronized clocks. A hierarchy of
               fail-aware services is presented. Overall, this
               paper is very dense and much of the details of
               protocols are left to other references which should
               be read to be convincing. It is another example that
               detection is a prerequisite of fail-safe or masking
               fault tolerance.}
}
@inproceedings{Fetzer:1997:FAD,
author = {Christof Fetzer and Flaviu Cristian},
title = {A Fail-Aware Datagram Service},
booktitle = {Proceedings of the 2nd Annual Workshop on Fault-Tolerant Parallel and Distributed Systems},
year = {1997},
month = apr,
address = {Geneva, Switzerland},
note = {\url{http://www-cse.ucsd.edu/users/cfetzer/FADS/fads.html}},
annote = "[to read]"
}
@InProceedings{Fetzer:1997:TAA,
author = "Christof Fetzer and Shivakant Mishra and Flaviu
Cristian",
title = "The Timewheel Asynchronous Atomic Broadcast Protocol",
booktitle = "International Conference on Parallel and Distributed
Processing Techniques and Applications (PDPTA'97)",
publisher = "IEEE",
address = "Las Vegas, Nevada, USA",
month = jun,
year = "1997",
abstract = "http://www.cps.udayton.edu/\~{}pan/pdpta.",
annote = "Presents a collection of several total order broadcast protocols.
Ordering can be unordered, total or time order, atomicity can be
weak, strong or strict. Focus is on performance issues, unlike
\cite{Hadzilacos:1994:MAF}. I think there's also something called
timewheel group membership."
}
@inproceedings{Franklin:1997:FES,
  author    = {Matthew K. Franklin and Michael K. Reiter},
  title     = {Fair Exchange with a Semi-Trusted Third Party},
  booktitle = {4th {ACM} Conference on Computer and Communications
               Security},
  editor    = {Tsutomu Matsumoto},
  pages     = {1--5},
  publisher = pub-ACM,
  address   = {Z{\"u}rich, Switzerland},
  month     = apr,
  year      = {1997},
  annote    = {active exchange? [to get]}
}
@Misc{Gaertner:1997:FRD,
author = {Felix G{\"a}rtner},
title = "Fehlertolerante {Replikation} von {Diensten} mit
schwacher {Konsistenz} mittels selbststabilisierender
verteilter {Algorithmen}",
howpublished = {Diplomarbeit DA-BS-1997-06 am Fachgebiet
Betriebssysteme des Fachbereichs
Informatik, Technische Universit{\"a}t Darmstadt},
year = "1997",
month = dec,
note = {Internet:
\texttt{http://www.informatik.tu-darmstadt.de/\-$\tilde{}$felix/diploma}}
}
@InProceedings{Garg:1997:OCD,
author = "Vijay K. Garg",
title = "Observation and control for debugging distributed
computations",
pages = "1--12",
booktitle = "3rd Int. Workshop on Automated Debugging (AADEBUG
97)",
year = "1997",
address = "Link{\"o}ping, Sweden",
url = "http://www.ep.liu.se/ea/cis/1997/009/",
month = may,
OPTnote = "keynote presentation",
annote = "As one of the ``big men'' in theory of distributed
systems, Garg presents here an overview over the
topics of observation and control of distributed
computations. Objective of control is to either
maintain an invariant on a global state or to ensure
a proper order of events. Observation is used to
monitor system actions. Three restrictions impose
problems on observation: (1) the lack of shared
clock can be alleviated by substituting causality
for real time and detecting predicates transformed
using `possibly' and `definitely'. Possibly true
predicates are useful for detecting bad conditions,
whereas definitely true predicates are useful to
verify the occurrence of good predicates. (2) The
lack of shared memory can be alleviated by using the
notion of monotonicity. A predicate is monotone with
respect to a variable if monotonic changing of that
variable doesn't change the truth of the
predicate. This allows us to restrict our attention
to state intervals rather than states. This allows
us to reduce the number of events that must be
inspected drastically. (3) Combinatorial explosion
is combatted by the use of linear predicates. In
general, detecting possibly is NP-complete. However,
linear predicates can be detected efficiently: a
predicate is linear if its value `false' can be
detected ``locally'' (i.e., it contains a forbidden
state of a process or channel). So conjunctions of
local predicates can be efficiently detected. The
paper briefly surveys some possibly detection
algorithms and states some open problems. Then it
turns to the issue of control and discusses
different modes (on-line, off-line) and methods
(delaying/reordering events). Finally, a fictitious
(but implementable) distributed debugger is
described. Overall, this is a very fluent
introductory paper to the issues of observation (and
control) in distributed systems."
}
@InProceedings{Guerraoui:1997:CBM,
author = "Rachid Guerraoui and Andr{\'e} Schiper",
title = "Consensus: the big misunderstanding",
OPTpages = "???",
booktitle = "Proceedings of the 6th Workshop on Future Trends of
Distributed Computing Systems (FTDCS-6)",
year = "1997",
month = oct,
annote = "This paper tries to clarify six popular
misunderstandings about the consensus problem that
prevent consensus as being considered fundamental
both in theory and in practice. The
misunderstandings are: (1) Consensus is for
theoreticians only, (2) Time-outs are enough, (3)
There is no life after FLP, (4) The failure detector
model is unrealistic, (5) Time-free means
inefficient, (6) Asynchronous algorithms cannot be
used for time critical applications. A very concise
and well readable paper that does good summarizing
work and is a good source for arguments."
}
@InProceedings{Guerraoui:1997:GAM,
author = "Rachid Guerraoui and Andr{\'e} Schiper",
title = "Genuine atomic multicast",
number = "1320",
series = ser-LNCS,
pages = "141--154",
booktitle = pro-WDAG97,
year = "1997",
publisher = pub-SV,
month = sep,
annote = "The authors define genuine atomic multicast to be an
atomic multicast with a specific minimality property,
i.e. that only the processes in the multicast group
``act'' and others remain ``quiet'' (this is as
opposed to atomic multicast faked by an underlying
atomic broadcast). They show that genuine atomic
multicast is strictly stronger than atomic broadcast
in that it needs a perfect failure detector (thus
stricter synchrony assumptions) to be solvable in
asynchronous systems. They argue, that it is exactly
the minimality requirement that makes the problem
unsolvable with unreliable failure detection."
}
@article{Guerraoui:1997:SBR,
  author    = {Rachid Guerraoui and Andr{\'e} Schiper},
  title     = {Software-Based Replication for Fault Tolerance},
  journal   = j-IEEE-COMPUTER,
  volume    = {30},
  number    = {4},
  pages     = {68--74},
  month     = apr,
  year      = {1997},
  keywords  = {correctness criterion; cost; fault tolerance; group
               communication; linearizability; message passing;
               off-the-shelf hardware; replicated servers; replicated
               service implementation techniques; reviews; software
               fault tolerance; software-based replication;
               specialized hardware; survey},
  treatment = {G General Review},
  annote    = {This is a general survey over software based
               replication techniques to achieve fault tolerance
               with a strong emphasis on the relations to group
               communication and consensus. Issues of
               view-synchronous and totally ordered communication
               and their relation to consensus using unreliable
               failure detectors are discussed. In general, this is
               a lightweight overview paper that doesn't upset your
               tummy.}
}
@Article{Hsueh:1997:FIT,
author = "Mei-Chen Hsueh and Timothy K. Tsai and Ravishankar K.
Iyer",
title = "Fault Injection Techniques and Tools",
journal = j-IEEE-COMPUTER,
volume = "30",
number = "4",
pages = "75--82",
month = apr,
year = "1997",
annote = "A survey over current fault injection techniques and
tools. A good quote. Interesting are the different
types of software fault injection techniques and
their relations to the program transformational
approach in describing failure models
\cite{Gaertner:1998:SFT}. They have the same
underlying idea but a different purpose: one is
experimental (and dynamic) and the other is
theoretical (and static). A German reference is
\cite{Echtle:1998:FMB}."
}
@TechReport{Hurfin:1997:CAS,
author = "Michel Hurfin and Achour Most{\'e}faoui and Michel Raynal",
title = "Consensus in asynchronous systems where processes
can crash and recover",
institution = "Institut de Recherche en Informatique et Syst{\`e}mes
Al{\'e}atoires (IRISA)",
year = 1997,
number = 1144,
address = "Campus de Beaulieu, 35042 Rennes Cedex, France",
month = nov,
annote = "[to read] surveyed in \cite{Aguilera:1998:FDC}.
Published at SRDS'98 \cite{Hurfin:1998:CAS}."
}
@INPROCEEDINGS{Kakugawa:1997:DSD,
AUTHOR = "Hirotsugu Kakugawa and Masaaki Mizuno and Mikhail
Nesterenko",
TITLE = "Development of self-stabilizing distributed
algorithms using transformation: case studies",
PAGES = "16--30",
BOOKTITLE = pro-wss97,
YEAR = 1997,
annote = "The authors evaluate their transformation algorithm
\cite{Mizuno:1996:TBT} from the serial model to the
distributed model on several examples including
lock-based mutual exclusion and leader
election. They conclude that transformed algorithms
have a larger message complexity (which depends on
the choice of timeout values) but this is paid off
by sparing the hassle of developing, debugging and
verifying algorithms for the distributed model from
scratch. Simulation results suggest, that both types
of algorithms have the same asymptotic message
complexity."
}
@techreport{Kreitz:1997:FRC,
  author      = {Christoph Kreitz},
  title       = {Formal reasoning about communication systems {I}:
                 {Embedding} {ML} into type theory},
  institution = {Cornell University},
  address     = {Ithaca},
  number      = {TR97-1637},
  month       = jul,
  year        = {1997},
  annote      = {Abstract: We present a semantically correct
                 embedding of a subset of the Ocaml programming
                 language into the type theory of NuPRL. The subset
                 is that needed to build the Ensemble group
                 communication system. We describe the essential
                 methodologies for representing language constructs
                 by type-theoretical expressions. Tactics
                 representing derived inference rules and a
                 programming logic for these constructs will be
                 discussed as well as algorithms for translating an
                 Ocaml-program into NuPRL-objects and vice versa. The
                 formal representations and the translation
                 algorithms will serve as the foundation for the
                 development of automated reasoning tools for the
                 verification and optimization of a group
                 communication systems. [(noch) nicht ausgedruckt]}
}
@article{Kuhn:1997:SFP,
  author  = {D. Richard Kuhn},
  title   = {Sources of Failure in the Public Switched Telephone
             Network},
  journal = j-IEEE-COMPUTER,
  volume  = {30},
  number  = {4},
  pages   = {31--36},
  month   = apr,
  year    = {1997},
  annote  = {[to read]}
}
@InProceedings{Kulkarni:1997:CDM,
author = "Sandeep S. Kulkarni and Anish Arora",
title = "Compositional design of multitolerant repetitive
{Byzantine} agreement",
booktitle = "Proceedings of the 17th International Conference on
the Foundations of Software Technology and
Theoretical Computer Science, Kharagpur, India",
year = "1997",
annote = "Byzantine agreement is taken as an application
example of building fault tolerant programs using the
detectors and correctors methodology of Arora and
Kulkarni \cite{Arora:1998:CDM}."
}
@InProceedings{Malkhi:1997:UID,
  author    = {Dahlia Malkhi and Michael Reiter},
  title     = {Unreliable Intrusion Detection in Distributed Computations},
  booktitle = {Proceedings of the 10th Computer Security
               Foundations Workshop (CSFW97)},
  address   = {Rockport, MA},
  year      = 1997,
  month     = jun,
  pages     = {116--124},
  annote    = {[to read]}
}
@Book{Menezes:1997:HAC,
  author    = "Alfred J. Menezes and Paul C. van Oorschot and Scott
               A. Vanstone",
  title     = "Handbook of Applied Cryptography",
  publisher = "CRC Press",
  address   = "Boca Raton, FL",
  year      = "1997",
  annote    = "Brilliant and beautiful book on all aspects of
               cryptography with a strong practical perspective
               without diving into source code (like Schneier)."
}
@Article{Nelles:1997:NNI,
  author  = {O. Nelles and S. Ernst and R. Isermann},
  title   = {{Neuronale Netze zur Identifikation nichtlinearer
             dynamischer Systeme: ein \"Uberblick}},
  journal = {Automatisierungstechnik},
  volume  = 45,
  number  = 6,
  pages   = {251--262},
  year    = 1997,
  annote  = {[Angabe von Armin]}
}
@TechReport{Oliveira:1997:CCR,
  author      = "Rui Oliveira and Rachid Guerraoui and {Andr\'e} Schiper",
  title       = "Consensus in the crash-recover model",
  institution = "EPFL -- D{\'e}partement d'Informatique",
  address     = "Lausanne, Switzerland",
  year        = "1997",
  number      = "TR-97/239",
  month       = aug,
  annote      = "[to read] surveyed in \cite{Aguilera:1998:FDC}."
}
@InProceedings{Pagnia:1997:TMP,
  author     = {Henning Pagnia and Ralph Jansen},
  title      = {Towards Multiple-Payment Schemes for Digital Money},
  booktitle  = {Financial Cryptography: First International
                Conference, {FC}~'97},
  editor     = {Rafael Hirschfeld},
  series     = ser-LNCS,
  volume     = 1318,
  publisher  = pub-SV,
  address    = {Anguilla, British West Indies},
  year       = 1997,
  month      = "24--28~" # feb,
  pages      = {203--215},
  ISBN       = {3-540-63594-7},
  references = "{CRYPTO::Brands1993} {CRYPTO::chaumFN1988}
                {EUROCRYPT::ChaumP1992}
                {CRYPTO::Ferguson1993}
                {EUROCRYPT::Jakobsson1995}",
  annote     = {[to read] reinvented in \cite{Riordan:1998:CEP}.}
}
@InProceedings{Prisco:1997:RPA,
  author    = "De Prisco, Roberto and Butler Lampson and Nancy Lynch",
  title     = "Revisiting the {Paxos} Algorithm",
  booktitle = pro-wdag97,
  pages     = "111--125",
  year      = "1997",
  month     = sep,
  annote    = "[to read]"
}
@Misc{Rock:1997:TSC,
  author       = "Georg Rock and Werner Stephan and Andreas Wolpers",
  title        = "Tool support for the compositional development
                  of distributed systems",
  howpublished = "\url{http://www.dfki.uni-sb.de/vse/papers/rsw97.ps.gz}",
  year         = 1997,
  month        = may,
  annote       = "[to read] published elsewhere?"
}
@InProceedings{Rushby:1997:SFV,
  author    = {John Rushby},
  title     = {Systematic Formal Verification for Fault-Tolerant
               Time-Triggered Algorithms},
  booktitle = {Dependable Computing for Critical Applications---6},
  series    = {Dependable Computing and Fault Tolerant Systems},
  volume    = 11,
  year      = 1997,
  editor    = {Mario Dal Cin and Catherine Meadows and William H. Sanders},
  publisher = {IEEE Computer Society},
  address   = {Garmisch-Partenkirchen, Germany},
  month     = mar,
  pages     = {203--222},
  annote    = "Rushby argues for the separation of algorithm
               functionality and timeliness properties. Proofs for
               time-critical modules can be quite cumbersome if
               they are tried as is, but they can become much
               simpler if the abstract functionality is proven
               correct and they are then embedded into a real-time
               environment in a safe way by a once-and-for-all
               proven methodology (an idea also proposed by Le Lann
               \cite{LeLann:1995:ORN}). Rushby presents such a
               transformation for (synchronous) round based
               algorithms: such an algorithm can be mechanically
               transformed into a time-triggered implementation
               with tight real-time bounds. The case
               is made by transforming the famous oral message BGP
               protocol \cite{Lamport:1982:BGP} into a
               time-triggered version by hand and using the PVS
               automated proof system."
}
@Article{Schiper:1997:ECA,
  author  = "{Andr\'e} Schiper",
  title   = "Early consensus in an asynchronous system with a
             weak failure detector",
  journal = j-DC,
  year    = "1997",
  volume  = "10",
  number  = "3",
  pages   = "149--157",
  annote  = "The author presents a new algorithm for consensus in
             asynchronous systems which is an improvement over
             the original algorithm by Chandra and Toueg
             \cite{Chandra:1996:UFD}. Both use an unreliable
             eventually strong failure detector. The new early
             consensus algorithm uses the rotating coordinator
             paradigm and proceeds in asynchronous rounds. At the
             beginning of a round, the coordinator sends its
             estimate to all and tries to impose this value on
             the rest. A process receiving this estimate reissues
             it to all. As soon as a process receives this
             estimate from a majority of processes, it decides on
             that estimate. The algorithm ensures that once a
             majority of processes have adopted the same
             estimate, this value is locked and doesn't change
             anymore. So once a process decides, all other
             processes that decide do not decide differently. The
             failure detector ensures the liveness of the
             protocol. In comparison to the original
             Chandra/Toueg algorithm (CT) early consensus uses
             $n(n-1)$ messages to reach a decision in
             point-to-point networks while CT uses $3(n-1)$
             messages. However, the decision value must be sent
             to all (to cater for failure cases), and so both
             algorithms need an additional $n(n-1)$ messages for
             the total execution. Both therefore have $O(n^2)$
             message complexity. However, early consensus has a
             lower latency degree. The latency degree is defined
             to be the largest timestamp of logical time, where
             ``messages tick''. This is a more precise measure
             for the number of rounds that an algorithm needs to
             execute. Early consensus has a latency degree of 2,
             whereas CT has a latency degree of 4 (easily
             optimized to 3). The efficiency stems from improving
             parallelism by adding messages in the second part of
             a round. So early consensus is both an improvement in
             simplicity as it is in efficiency. See also Erratum
             \cite{Schiper:1997:EEC}."
}
@Article{Schiper:1997:EEC,
  author  = {{Andr\'e} Schiper},
  title   = {Erratum: Early consensus in an asynchronous system
             with a weak failure detector},
  journal = j-DC,
  volume  = 10,
  pages   = 198,
  year    = 1997,
  annote  = {corrections of lines 34 and 46 in Figure 1 of page
             153.}
}
%%% NOTE(review): the value "10--13" in the pages field below looks
%%% suspicious -- it may be the conference dates (ICPADS'97 took place in
%%% December 1997) rather than a page range.  TODO: verify against the
%%% printed proceedings before relying on it.
@InProceedings{Setz:1997:DIA,
author = {Thomas Setz},
title = {Design, implementation and performance of a fault tolerant
tuple space machine},
booktitle = {Proceedings of the International Conference on Parallel
and Distributed Systems (ICPADS'97)},
OPTcrossref = {},
OPTkey = {},
pages = {10--13},
year = {1997},
OPTeditor = {},
OPTvolume = {},
OPTnumber = {},
OPTseries = {},
address = {Seoul, Korea},
month = dec,
OPTorganization = {},
publisher = pub-IEEE-CSP,
OPTnote = {},
annote = {Conference version of \cite{Setz:1997:DIP}.}
}
@TechReport{Setz:1997:DIP,
  author      = {Thomas Setz},
  title       = {Design, Implementation and Performance of a Mutex-Token based
                 Fault-Tolerant Tuple Space Machine},
  institution = {Sonderforschungsbereich 124, Universit{\"a}t des Saarlandes},
  year        = {1997},
  number      = {SFB 124 - 09/1997, TP D5},
  address     = {Fachbereich Informatik, 66041 {Saarbr\"ucken}, Germany},
  month       = jul,
  url         = "http://cdc-server.cdc.informatik.tu-darmstadt.de/home/LiPS/LiPS/documentation/objects/doc/html/papers/FTTM/FTTM_technical_report_sb/FTTM_technical_report_sb.html",
  annote      = {Introduction to LiPS and description of the membership
                 protocol used to make the tuple space engine fault
                 tolerant. Appeared at ICPADS'97 \cite{Setz:1997:DIA}.}
}
@InProceedings{Sims:1997:RMS,
  author    = "J. T. Sims",
  title     = "Redundancy Management Software Services for {Seawolf}
               Ship Control System",
  pages     = "390--394",
  booktitle = pro-ftcs97,
  ISBN      = "0-8186-7831-3",
  month     = jun,
  publisher = "IEEE",
  address   = "Washington - Brussels - Tokyo",
  year      = "1997",
  annote    = "Seawolf is a ``new'' class of US Navy attack
               submarines. Its computer system is quadruply redundant with four
               independent fault containment regions which use Byzantine tolerant
               voting to achieve consensus on output. The voting process is
               implemented in simple hardware and also is quadruply redundant. The
               system is masking tolerant against up to two non-simultaneous
               permanent faults before it is fail-safed. Faulty components can be
               exchanged online. The processors operate in lock-step synched
               mode. Fault detection and isolation methods and reconfiguration
               facilities are also described."
}
@InProceedings{Stoller:1997:DGP,
  author       = {Scott D. Stoller},
  title        = {Detecting Global Predicates in Distributed Systems
                  with Clocks},
  booktitle    = pro-wdag97,
  OPTeditor    = {Marios Mavronicolas and Philippas Tsigas},
  OPTnumber    = {1320},
  OPTseries    = ser-LNCS,
  year         = {1997},
  OPTpublisher = pub-SV,
  month        = sep,
  pages        = {185--199},
  annote       = "Stoller proposes a generalization of predicate
                  detection in distributed computations based on
                  lattice theory: he shows that any partial order with
                  certain properties can be used to reason about
                  consistent global states. From such an ordering
                  follow generic definitions of the modalities
                  `possibly' and `definitely' introduced by Cooper and
                  Marzullo \cite{Cooper:1991:CDG}. The author
                  instantiates his generic definitions with two orders
                  which are based on the values of synchronized
                  clocks. The first is called `definitely occurred
                  before' and the second `possibly occurred before'; he
                  also presents adaptations of known algorithms to
                  detect them. Such algorithms can be optimized if the
                  predicate has a certain (conjunctive) form
                  (analogous to local detectability in constraint
                  satisfaction \cite{Arora:1996:CSB}). A combination
                  of possibly and definitely called `instantaneously'
                  (or `properly') is introduced and
                  discussed. Application of the results is seen in
                  online monitoring and debugging of distributed
                  applications, not in fault tolerance, although the
                  example of debugging database coherence protocols is
                  near to detecting illegal states."
}
@TechReport{Weber:1997:DAW,
  author      = {Michael Weber and Rolf Walter and Hagen {V\"olzer}
                 and Tobias Vesper and Wolfgang Reisig and Sibylle
                 Peuker and Ekkart Kindler and {J\"orn} Freiheit and
                 {J\"org} Desel},
  title       = {{DAWN}: {Petrinetzmodelle} {zur} {Verifikation}
                 {Verteilter} {Algorithmen}},
  type        = {Informatik-Bericht},
  number      = 88,
  institution = {Humboldt-{Universit\"at} Berlin, Institut {f\"ur}
                 Informatik},
  address     = {Unter den Linden 6, D-10099 Berlin},
  year        = 1997,
  month       = dec,
  OPTannote   = {[to read]}
}
@Misc{Wilhelm:1997:CPO,
  author = "Uwe G. Wilhelm",
  title  = "Cryptographically Protected Objects",
  month  = may,
  year   = 1997,
  note   = "A French version appeared in the Proceedings
            of RenPar'9, Lausanne, CH. {{\tt
            http://lsewww.epfl.ch/\~{}wilhelm/CryPO.html}}",
  annote = "Presents the idea of a tamper proof computing environment."
}
@TechReport{Aguilera:1998:FDCTR,
  title       = "Failure Detection and Consensus in the Crash-Recovery
                 Model",
  author      = "Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg",
  number      = "TR98-1676",
  year        = "1998",
  institution = "Cornell University, Computer Science Department",
  abstract    = "We study the problems of failure detection and
                 consensus in asynchronous systems in which processes
                 may crash and recover, and links may lose messages. We
                 first propose new failure detectors that are
                 particularly suitable to the crash-recovery model. We
                 next determine under what conditions stable storage is
                 necessary to solve consensus in this model. Using the
                 new failure detectors, we give two consensus algorithms
                 that match these conditions: one requires stable
                 storage and the other does not. Both algorithms
                 tolerate link failures and are particularly efficient
                 in the runs that are most likely in practice --- those
                 with no failures or failure detector mistakes. In such
                 runs, consensus is achieved within 3d time and with 4n
                 messages, where d is the maximum message delay and n is
                 the number of processes in the system.",
  month       = apr,
  annote      = "The authors extend the work on asynchronous
                 consensus using unreliable failure detectors to a
                 more severe fault model than previous research has
                 considered: now nodes may crash and recover, and
                 links may lose messages. The authors first derive
                 specifications for failure detectors which are
                 better suited for this new fault model than those
                 proposed in earlier papers by other authors. They do
                 this by showing that the usual strong completeness
                 property for the crash-recovery model (stating that
                 eventually every bad process is permanently
                 suspected by all good processes) is too strong
                 because these detectors have to make predictions on
                 the future behavior of other processes. They propose
                 a new form of failure detectors with an infinite
                 output domain and with different properties that
                 circumvents the problems of the previous
                 specification. Next, the authors identify under
                 what conditions stable storage is necessary to solve
                 consensus in such an environment. They show that as
                 long as the number of always-up processes is less than
                 or equal to the number of eventually-down processes
                 consensus cannot be reached even if links do not
                 lose messages and an eventually perfect failure
                 detector can be used. Saving the proposed/decision
                 values on stable storage does not help if there are
                 additionally more than two eventually-down
                 processes. However, if there are more always-up
                 processes than bad processes consensus can be solved
                 even without stable storage (two increasingly
                 efficient algorithms are given). With stable storage
                 consensus is solvable if there is a majority of good
                 processes in the system (an algorithm is given).
                 So, as long as one can guarantee that more processes
                 never crash than those processes that are unstable
                 or will eventually remain down, stable storage is
                 not needed. If all processes may crash at least
                 once, stable storage and a majority of good
                 processes are needed to solve consensus. All results
                 hold for fair lossy channels."
}
@InProceedings{Aguilera:1998:FDC,
  author    = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title     = {Failure Detection and Consensus in the Crash-Recovery Model},
  booktitle = {Proceedings of the 12th International Symposium on
               Distributed Computing (DISC)},
  pages     = {231--245},
  year      = {1998},
  number    = {1499},
  series    = ser-LNCS,
  address   = {Andros, Greece},
  month     = sep,
  publisher = pub-SV,
  annote    = {Described in \cite{Aguilera:1998:FDCTR}, this is a more
               citeable reference. Published in DC in 2000
               \cite{Aguilera:2000:FDC}.}
}
@Article{Akguel:1998:ICS,
  author  = "Tayfun {Akg\"ul}",
  title   = "International Conference on Self-Similar Systems (Cartoon)",
  journal = "IEEE -- The Institute",
  year    = 1998,
  month   = sep,
  pages   = 10,
  annote  = "Shows some linguistic resemblance to WSS."
}
@Article{Akguel:1998:TZT,
  author  = "Tayfun {Akg\"ul}",
  title   = "Teaching the Z-Transform (Cartoon)",
  journal = "IEEE -- The Institute",
  year    = 1998,
  month   = nov,
  pages   = 12,
  annote  = "Shows a professor talking to a ZZZZ-sleeping audience."
}
@InProceedings{Almeida:1998:ULG,
  author    = {Carlos Almeida and Paulo Ver{\'\i}ssimo},
  title     = {Using light-weight groups to handle timing failures in
               {\em quasi-synchronous} systems},
  booktitle = {Proceedings of the 19th IEEE Real-Time Systems Symposium},
  year      = 1998,
  address   = {Madrid, Spain},
  month     = dec,
  annote    = "Covers part of the work described in
               \cite{Almeida:1998:QSA}."
}
@TechReport{Almeida:1998:QSA,
  author      = "Carlos Almeida and Paulo Ver{\'\i}ssimo and
                 Ant{\'o}nio Casimiro",
  title       = "The quasi-synchronous approach to fault-tolerant and
                 real-time communication and processing",
  institution = {Instituto Superior T\'{e}cnico},
  year        = 1998,
  number      = {CTI RT-98-04},
  address     = {Lisboa, Portugal},
  month       = jul,
  annote      = "The authors propose a new system model to use for
                 large-scale fault-tolerant distributed systems, the
                 quasi-synchronous approach. The authors augment the
                 asynchronous model by adding a timing failure
                 detector to the system. A timing failure detector
                 can perfectly detect the non-timeliness of certain
                 events within a fixed period of time. With such a
                 failure detector it is possible to build reliable
                 systems in asynchronous environments because it is
                 essentially a perfect failure detector as described
                 by Chandra and Toueg \cite{Chandra:1996:UFD}. The
                 authors argue that such a failure detector can be
                 implemented over modern ``synchronous'' network
                 communications like ATM or GSM. Thus, only part of
                 the system (control channels vs. payload channels)
                 need be synchronous, easing the burden of practical
                 implementations. The authors give excellent reviews
                 of the current work in this area and show several
                 ways how the timeliness properties of
                 quasi-synchronous applications can be increased: (1)
                 by an early delivery causal atomic broadcast, (2) by
                 dynamically adjusting the QoS (and thus timeliness
                 deadlines) and (3) by active replication to limit
                 response times of servers. Overall, this is an
                 excellent paper which is also suited as an
                 introduction to the area (after reading
                 \cite{Chandra:1996:UFD}). Previous ideas appeared in other
                 form in \cite{Almeida:1996:TFD,Verissimo:1995:QSS}."
}
@InProceedings{Arndt:1998:DLD,
  author       = {Olaf Arndt and Bernd Freisleben and Thilo Kielmann and
                  Frank Thilo},
  title        = {Dynamic load distribution with the {WINNER} system},
  booktitle    = {Proceedings of the Workshop ``Anwendungsbezogene
                  Lastverteilung'' (ALV'98)},
  organization = {Technische Universit\"at M\"unchen},
  address      = {M\"unchen, Germany},
  year         = 1998,
  pages        = {77--88},
  annote       = {}
}
@Article{Arora:1998:CDM,
  author  = "Anish Arora and Sandeep S. Kulkarni",
  title   = "Component based design of multitolerant systems",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year    = "1998",
  volume  = "24",
  number  = "1",
  pages   = "63--78",
  month   = jan,
  annote  = "Refinement of Arora's theory of closure and
             convergence \cite{Arora:1993:CCF}: the ability to
             tolerate certain kinds of faults is added to a
             system in a stepwise manner by adding detectors,
             that can detect invalidation of safety, and
             correctors, that re-establish liveness. By adding
             these components, care must be taken, that they do
             not interfere with each other. The application
             example developed in the paper is a multitolerant
             token ring protocol. The model used is the serial
             model. The difficulties of extending it to message
             passing models are not discussed."
}
@InProceedings{Arora:1998:DCT,
  author    = {Anish Arora and Sandeep S. Kulkarni},
  title     = {Detectors and Correctors: A theory of
               fault-tolerance components},
  booktitle = pro-icdcs98,
  year      = 1998,
  month     = may,
  annote    = {A compact presentation and discussion of
               \cite{Arora:1998:CDM}.}
}
@Article{Arora:1998:DMF,
  author  = "Anish Arora and Sandeep S. Kulkarni",
  title   = "Designing masking fault tolerance via nonmasking
             fault tolerance",
  journal = j-IEEE-TRANS-SOFTW-ENG,
  year    = "1998",
  volume  = "24",
  number  = "6",
  pages   = "435--450",
  month   = jun,
  annote  = "A paper in the line of Arora's theory of correctors
             and detectors \cite{Arora:1998:CDM}. A fault
             intolerant program is transformed into a non-masking fault
             tolerant program by adding correctors and then
             transformed into a masking fault tolerant program by
             adding detectors. Detectors inhibit normal program
             actions when invalidation of the safety predicate is
             observed. Thus the program only takes ``safe''
             steps. Application examples include Byzantine
             agreement, reliable data transfer, mutual exclusion."
}
@InProceedings{Arora:1998:SFC,
  author    = {Anish Arora and Paul C. Attie and E. Allen Emerson},
  title     = {Synthesis of fault-tolerant concurrent programs},
  booktitle = pro-podc98,
  year      = 1998,
  pages     = {173--182},
  annote    = {Based on a synthesis method for concurrent programs
               by Emerson and Clarke, this paper extends the
               possibilities to synthesize fault-tolerant programs
               that can tolerate a certain fault class. Faults are
               modelled as state transitions with a possibly
               extended state space, and recovery transitions are
               used to tolerate these faults. The method is based
               on temporal logic specifications. As examples,
               solutions to mutual exclusion and barrier
               synchronization are synthesized.}
}
@InProceedings{Asokan:1998:APO,
  author    = "N. Asokan and Victor Shoup and Michael Waidner",
  title     = "Asynchronous protocols for optimistic fair exchange",
  booktitle = "Proceedings of the IEEE Symposium on Research in
               Security and Privacy",
  year      = 1998,
  month     = may,
  pages     = "86--99",
  url       = "http://www.zurich.ibm.com/Technology/Security/publications/1998/ASW98.ps.gz",
  note      = "Printed version contains some errors. Errata sheet is
               distributed together with the electronic version.",
  annote    = "[to read]"
}
@InProceedings{Asokan:1998:OFE,
  author    = "N. Asokan and Victor Shoup and Michael Waidner",
  title     = "Optimistic Fair Exchange of Digital Signatures",
  pages     = "591--606",
  note      = "A longer version is available as Technical Report
               RZ 2973 (\#93019), IBM Research, November 1997 at
               http://www.zurich.ibm.com/Technology/Security/publications/1997/ASW97b.ps.gz",
  booktitle = "EuroCrypt 98",
  year      = "1998",
  publisher = pub-SV,
  editor    = "Kaisa Nyberg",
  series    = ser-LNCS,
  volume    = "1403",
  annote    = "[to read]"
}
@Misc{Autexier:1998:VSE,
  author       = {Serge Autexier and Dieter Hutter and Bruno Langenstein
                  and Heiko Mantel and Georg Rock and Axel Schairer and
                  Werner Stephan and Roland Vogt and Andreas Wolpers},
  title        = {{VSE}: {Formal} methods meet industrial needs},
  howpublished = {\url{http://www.dfki.uni-sb.de/vse/papers/ahlm98.ps.gz}},
  year         = {1998},
  annote       = {[to read] announced to appear in Software Tools for
                  Technology Transfer, 1998, Springer, Special issue
                  on mechanized theorem proving for technology. Contains
                  case study on ROBERTINO robot control system.}
}
@InProceedings{Beauquier:1998:TFD,
  author    = "Joffroy Beauquier and Sylvie {Dela\"et} and Shlomi Dolev
               and {S\'ebastien} Tixeuil",
  title     = "Transient fault detectors",
  booktitle = "Proceedings of the 12th International Symposium on
               DIStributed Computing (DISC'98)",
  series    = ser-LNCS,
  number    = 1499,
  publisher = pub-SV,
  address   = "Andros, Greece",
  year      = 1998,
  month     = sep,
  pages     = "62--74",
  annote    = "[to read]"
}
@Article{Belli:1998:MHS,
  author  = "Fevzi Belli",
  title   = "{Methoden und Hilfsmittel f\"ur die systematische
             Pr\"ufung komplexer Software}",
  journal = j-IS,
  volume  = 21,
  number  = 6,
  pages   = "337--346",
  year    = 1998,
  month   = dec,
  annote  = "Vorstellung von konventionellen Testmethoden und
             Testwerkzeuge, Reviews etc."
}
@MastersThesis{Bendrath:1998:CNR,
  author = {Ralf Bendrath},
  title  = {{Computer und die neue Rolle des Milit\"ars in den USA}},
  school = {Freie Universit\"at Berlin, Fachbereich Politische
            Wissenschaft},
  year   = {1998},
  type   = {Diploma thesis (in German)},
  month  = aug,
  annote = {Eine sehr detaillierte und quellenreiche Arbeit ueber den
            Einfluss von Computern auf das Verhaeltnis zwischen
            Militaer und der Zivilgesellschaft. Eingegangen wird auf
            die neue Rolle des Soldaten im Krieg (Vernetzung,
            Integration von Strategie und Taktik), Automatisierung der
            Verarbeitung von militaerischen Daten (KI als
            Schluesseltechnologie und deren Gefahren), der Begriff des
            Information Warfare (Ausweitung computermilitaerischer
            Operationen auf den zivilen Bereich, zunehmende
            Ununterscheidbarkeit von militaerischen und zivilen
            Operationen durch Praevention), Probleme des
            Sicherheitsbegriffs.}
}
@InProceedings{Chandra:1998:HFF,
  author    = "S. Chandra and P. M. Chen",
  title     = "How fail-stop are faulty programs?",
  pages     = "240--249",
  booktitle = pro-ftcs98,
  year      = "1998",
  month     = jun,
  annote    = "see proceedings"
}
@Article{Chase:1998:DGP,
  author   = "Craig M. Chase and Vijay K. Garg",
  title    = "Detection of global predicates: Techniques and their
              limitations",
  journal  = j-DC,
  volume   = 11,
  number   = 4,
  pages    = "191--201",
  year     = 1998,
  abstract = "We show that the problem of predicate detection in
              distributed systems is NP-complete. In the past, efficient
              algorithms have been developed for special classes of predicates
              such as stable predicates, observer independent predicates, and
              conjunctive predicates. We introduce a class of predicates,
              semi-linear predicates, which properly contains all of the above
              classes. We first discuss stable, observer independent and
              semi-linear classes of predicates and their relationships with
              each other. We also study closure properties of these classes
              with respect to conjunction and disjunction. Finally, we
              discuss algorithms for detection of predicates in these
              classes. We provide a non-deterministic detection algorithm for
              each class of predicate. We show that each class can be
              equivalently characterized by the degree of non-determinism
              present in the algorithm. Stable predicates are defined as those
              that can be detected by an algorithm with the most
              non-determinism. All other classes can be derived by
              appropriately constraining the non-determinism in this
              algorithm.",
  annote   = "[to read]"
}
@InProceedings{Cristian:1998:TAS,
  author    = "Flaviu Cristian and Christof Fetzer",
  title     = "The timed asynchronous distributed system model",
  pages     = "140--149",
  booktitle = pro-ftcs98,
  year      = "1998",
  month     = jun,
  annote    = "The authors present a formal definition of a system
               model that is claimed to capture the current
               behavior of distributed systems like the
               Internet. The model makes the following assumptions:
               (1) processes have hardware clocks that have bounded
               drift rate, (2) processes communicate via unreliable
               datagram service with broadcast facility that has
               omission/performance failure semantics, (3)
               processes have crash/performance failure semantics,
               (4) there is no bound on load or failure rate, (5)
               services are usually timed, i.e., their
               specification prescribes a time interval within
               which some transitions will occur. Together with the
               optional extensions of stable storage and progress
               assumptions the authors claim that this model
               adequately reflects today's ``reality'', since
               important problems (like consensus etc.) are
               solvable in the Internet. The model also caters for
               network partitions (they are modeled by sufficiently
               many crash/omission failures). The notion of a
               bounded drift rate is sufficient to implement a
               failure detector that detects untimeliness of
               processing or responses and thus can be used to
               build fail-aware services \cite{Fetzer:1997:FAA}."
}
@Article{Echtle:1998:FMB,
  author  = {Klaus Echtle and {Jo\~ao} Gabriel Silva},
  title   = {{Fehlerinjektion -- ein Mittel zur Bewertung
             der Ma\ss{}nahmen gegen Fehler in komplexen
             Rechnersystemen}},
  journal = j-IS,
  year    = {1998},
  volume  = {21},
  number  = {6},
  pages   = {328--336},
  month   = dec,
  annote  = {Empirische Verl\"a\ss{}lichkeitsbewertung im Gegensatz zu
             analytischer (vgl. \cite{Thurner:1998:VKS}). Englische Referenz ist
             \cite{Hsueh:1997:FIT}. Die Autoren beleuchten alle wesentlichen
             Aspekte moderner Fehlerinjektionstechniken und beschreiben die
             Zusammenh\"ange zu anderen Gebieten der Informatik. Z.B. die N\"ahe
             der Fehlerinjektion zum normalen Software-Test und zur formalen
             Verifikation. Letztere st\"o\ss{}t aber oft an Leistungsgrenzen,
             w\"ahrend Fehlerinjektion fast immer einen gegebenen Aufwandsrahmen
             ausf\"ullen kann. Zun\"achst werden Techniken der physikalischen
             Fehlerinjektion (Einwirkung auf Pins, Bestrahlung durch Schwerionen
             oder elektromagnetische Strahlung) und software-implementierte
             Fehlerinjektion besprochen. Letztere unterscheidet sich in Injektion
             auf der Komponenten-Ebene (direkte Ver\"anderung des Codes, direkte
             \"Anderung von Variablen, Programmz\"ahler oder Register) und auf
             der System-Ebene (Abschw\"achung von Annahmen \"uber andere
             unabh\"angige Prozesse an der Nachrichtenschnittstelle). Die
             Verl\"a\ss{}lichkeitsbewertung und die Fehlererfassung
             (engl. coverage) geschieht dann durch Auswahl geeigneter,
             realistischer Fehlerszenarien und einer ausreichenden Anzahl von
             Experimenten. Abschlie\ss{}end wird auf den Test von
             Fehlertoleranzverfahren in verteilten Systemen eingegangen: Der
             Begriff des Fehlerbereiches wird eingef\"uhrt, um die \"ublichen
             Fehlermodelle (crash, Byzantine, etc.) zu beschreiben. Fehler
             k\"onnen dann an der Nachrichtenschnittstelle injiziert werden. Im
             Gegensatz zur formalen Verifikation bietet diese Technik den
             Vorteil, da\ss{} das System in einer ``realen'' Umgebung getestet
             wird. Ein paar g\"angige Fehlerinjektoren werden
             vorgestellt. Insgesamt ein guter \"Uberblick mit einer Art
             Markt\"ubersicht \"uber Fehlerinjektoren. Verwiesen wird bei der
             formalen Verifikation und Fehlermodellierung auf
             \cite{Echtle:1984:FSV}.}
}
@InProceedings{Gaertner:1998:EFR,
  author    = {Felix C. {G\"artner} and Henning Pagnia},
  title     = "Enhancing the fault tolerance of replication:
               another exercise in constrained convergence",
  pages     = "29--30",
  booktitle = pro-ftcs98-fastabs,
  year      = "1998",
  month     = jun
}
@InProceedings{Fetzer:1998:MCM,
  author    = {Christof Fetzer},
  title     = {The Message Classification Model},
  booktitle = {Proceedings of the 17th ACM Symposium on Principles of Distributed Computing},
  year      = {1998},
  month     = jun,
  address   = {Puerto Vallarta, Mexico},
  url       = {http://www.research.att.com/~christof/MCM},
  abstract  = "We propose a new system model for asynchronous
               distributed systems that we call the message classification
               model. Motivation for this model is its ability 1) to support a
               restricted but useful form of ``communication by time'' by
               classifying messages as either ``slow'' or ``fast'' but without
               incorporating neither real-time clocks nor ``time-outs'', and
               2) to describe transient and permanent network partitions. The
               message classification model allows the definition of different
               classes of classification schemes. To show that the model is
               indeed useful, we show how one can solve the consensus and the
               election problem for a certain class of message classification
               schemes.",
  annote    = "Contains a good overview and comparison of different models
               [to read]"
}
@TechReport{Gaertner:1998:FFT,
  author      = {Felix C. {G\"artner}},
  title       = {Fundamentals of fault tolerant distributed computing
                 in asynchronous environments},
  institution = {Darmstadt University of Technology},
  address     = {Darmstadt, Germany},
  number      = {TUD-BS-1998-02},
  year        = {1998},
  month       = jul,
  note        = {To appear in \textit{ACM Computing Surveys}, 31(1), March 1999.},
  url         = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1998-02.ps},
  annote      = {A generalization of Arora and Kulkarni's theory of
                 correction and detection \cite{Arora:1998:CDM} for
                 the asynchronous message passing model. The paper
                 first defines formally important terms like
                 redundancy, fault and fault tolerance. Then it shows
                 that fault tolerance cannot be achieved without
                 redundancy and reveals the two phases necessary in
                 fault tolerance: detection and correction. Detection
                 is generalized to possibility detection in
                 distributed systems and correction is generalized to
                 imposing a predicate on the system. Fundamental
                 methodologies of fault tolerant distributed
                 computing (like fail stop processors, state machine
                 approach, consensus) are shown to fit nicely into
                 the framework.}
}
@TechReport{Gaertner:1998:SFT,
  author      = {Felix C. {G\"artner}},
  title       = {Specifications for Fault Tolerance: {A} Comedy of Failures},
  institution = {Darmstadt University of Technology},
  address     = {Darmstadt, Germany},
  number      = {TUD-BS-1998-03},
  year        = {1998},
  month       = oct,
  url         = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1998-03.ps.gz},
  annote      = {[to write]}
}
@Article{Gamache:1998:WCS,
  author  = {Rod Gamache and Rob Short and Mike Massa},
  title   = {Windows {NT} clustering service},
  journal = j-IEEE-COMPUTER,
  volume  = {31},
  number  = {10},
  pages   = {55--62},
  year    = {1998},
  month   = oct,
  annote  = {A colourful article that praises the clustering service
             for high availability in NT 5.0. A service may be
             implemented on a cluster of servers (i.e., a set of
             identical machines) that all together transparently
             provide the service as if one single server were
             present. Hardware and software failures can be detected
             and failed applications can be restarted on other machines
             without interrupting the overall mode of
             operation. Several issues have not been touched yet
             because of ``technical complexity or schedule pressures'':
             these are active replication, process pairs,
             primary-backup, non-stop migration of processes and
             recovery of shared state between client and server. ``They
             will be added to future versions of the product.''}
}
@InProceedings{Garg:1998:DPD,
  author    = {Vijay K. Garg and J. Roger Mitchell},
  title     = {Distributed predicate detection in a faulty environment},
  booktitle = pro-icdcs98,
  year      = {1998},
  annote    = {The first real reference on general predicate
               detection in faulty environments. Several issues in
               this area are discussed: what type of failure
               detectors must be used? What if they produce false
               suspicions? How does this affect the validity of the
               global predicate? The authors present an algorithm
               to reliably detect a subclass of general predicates
               in an asynchronous message-passing environment
               subject to process crashes, message loss and channel
               crashes. The type of predicates they detect are
               set-decreasing and conjunctive. Set-decreasing means
               that whenever it holds for a set $S$ of processes,
               then it also holds for a set $S'\subseteq
               S$. Conjunctive means that it can be written as the
               conjunction of local predicates and send-monotonic
               channel predicates. Send-monotonic channel predicates
               are those that if it is false, merely sending messages
               can't make it true. The algorithm is based on one by
               Hurfin, Mizuno, Raynal and Singhal
               \cite{Hurfin:1996:ODC} for detecting conjunctions of
               local predicates. Every process acts as a Monitor
               process and control messages are piggybacked on
               application messages. The application must ensure
               that eventually every process sends a message to
               every neighbour. Predicate detection is performed by
               constructing the lattice of consistent global states
               starting from an ``earliest'' state at every
               process. Nodes that are suspected to have failed are
               not inspected for predicate evaluation. This is okay
               for this special type of predicates. The failure
               detector used satisfies weak completeness and
               infinitely often accuracy, meaning that every
               correct process is never permanently suspected. This
               is a weaker failure detector than the ``eventually
               weak'' failure detector of \cite{Chandra:1996:UFD}.}
}
@InProceedings{Garg:1998:IFD,
  author    = {Vijay K. Garg and J. Roger Mitchell},
  title     = {Implementable failure detectors in asynchronous systems},
  booktitle = {Proc. 18th Conference on Foundations of Software
               Technology and Theoretical
               Computer Science},
  OPTeditor = {V. Arvind and R. Ramanujin},
  number    = {1530},
  series    = ser-LNCS,
  publisher = pub-SV,
  address   = {Chennai, India},
  year      = {1998},
  month     = dec,
  url       = {http://maple.ece.utexas.edu/TechReports/1998/TR-PDS-1998-004.ps.Z}
}
@Book{Gertler:1998:FDD,
  author    = "J. Gertler",
  title     = "Fault Detection and Diagnosis in Engineering Systems",
  publisher = "Marcel Dekker",
  address   = "New York",
  year      = "1998",
  annote    = "[Angabe von Armin]"
}
@Article{Grosspietsch:1998:FKN,
  author  = {Karl-Erwin {Gro\ss{}pietsch} and Erik Maehle},
  title   = {{Fehlerbehandlung in komplexen nebenl\"aufigen
             Systemen}},
  journal = j-IS,
  volume  = {21},
  number  = {6},
  pages   = {347--355},
  year    = {1998},
  month   = dec,
  annote  = {Konzentriert sich auf Fehlertoleranzmassnahmen zur Wahrung
             von bestimmten Systemtopologien (Array, Baum, etc.). Stichworte:
             dynamische Redundanz, Rekonfiguration, fehlertolerantes Routing,
             Recovery.}
}
@InCollection{Hohl:1998:TLB,
  author    = "F. Hohl",
  title     = "Time Limited Blackbox Security: Protecting Mobile
               Agents from Malicious Hosts",
  pages     = "92--113",
  booktitle = "Mobile Agents and Security",
  crossref  = "Vigna:1998:MAS",
  annote    = {referenz von Uwe Wilhelm}
}
@Book{Hoffmann:1998:DMD,
  author    = {Paul Hoffman},
  title     = {{Der Mann, der die Zahlen liebte}},
  publisher = {Ullstein},
  year      = {1998},
  annote    = {Biographie von Paul Erd{\H{o}}s.}
}
@InProceedings{Hurfin:1998:CAS,
  author    = "Michel Hurfin and A. {Most\'efaoui} and M. Raynal",
  title     = "Consensus in asynchronous systems where processes
               can crash and recover",
  booktitle = "Proceedings of the 17th IEEE Symposium on Reliable
               Distributed Systems (SRDS'98)",
  publisher = pub-IEEE,
  address   = "West Lafayette, Indiana",
  pages     = "280--286",
  year      = "1998",
  month     = oct,
  annote    = "Hinweis aus FG Fehlertolerierende Rechnersysteme
               Mitteilungen, Maerz 1999. Previously a
               Technical Report \cite{Hurfin:1997:CAS}."
}
@InProceedings{Hutter:1998:VSE,
  author    = "Dieter Hutter and Heiko Mantel and Georg Rock and
               Werner Stephan and Andreas Wolpers and Michael Balser
               and Wolfgang Reif and Gerhard Schellhorn and Kurt Stenzel",
  title     = "{VSE:} {Controlling} the Complexity in Formal Software
               Developments",
  booktitle = "Proceedings of the International Workshop on Applied
               Formal Methods",
  address   = "Boppard, Germany",
  year      = "1998",
  annote    = "overview over VSE-II."
}
@Article{Kaiser:1998:EDV,
  author  = {{J\"org} Kaiser and Edgar Nett},
  title   = {{Echtzeitverhalten in dynamischen, verteilten Systemen}},
  journal = j-IS,
  volume  = {21},
  number  = {6},
  pages   = {356--365},
  year    = {1998},
  month   = dec,
  annote  = {Behandelt den Faktor ``Zeit'' in fehlertoleranten
             verteilten Systemen. Dabei sind zwei Aspekte von Bedeutung:
             Kommunikation und Scheduling. Kommunikation in Echtzeitsystemen
             mu\ss{} (1) Vorhersagbarkeit und (2) Kooperation
             gew\"ahrleisten. (1) Vorhersagbarkeit bedeutet eine obere Schranke
             auf Nachrichtenverz\"ogerung und eine Garantie von Eigenschaften
             unter Spitzenlast. Bei ersterem benutzt man reservierungsbasierte
             Verfahren (TDMA, braucht globale Zeit) und token-basierte
             (token-Ring, braucht keine globale Zeit). Ungeeignet ist Ethernet,
             obwohl darauf auch andere Verfahren implementiert werden
             k\"onnen. ATM ist eine Mischform. Kommunikationsfehler entstehen
             in der Wertedom\"ane und der Zeitdom\"ane. Sie werden in
             Fehlersemantiken wie omission oder crash beschrieben und m\"ussen
             toleriert werden. (2) Kooperation bedeutet Ordnung auf Nachrichten
             und Mitgliedschaft. In Echtzeitsystemen mu\ss{} man irgendwie Zeit
             mitspezifizieren. Es kann eine globale, synchronisierte Zeit
             angenommen werden (synchrone, eng synchronisierte Systeme) oder es
             werden zeitliche Systemannahmen lokal \"uber einen Timeout
             realisiert (zeitgesteuerte, asynchrone (engl. timed asynchronous),
             lose synchronisierte Systeme). Im eng synchronisierten Fall
             gen\"ugt Nachrichtendiffusion bei ausreichender Redundanz. In
             asynchronen Systemen braucht man eine Best\"atigung. Unter Hinweis
             auf \cite{Fischer:1985:IDC} wird bemerkt, da\ss{} irgendwelche
             Zeitannahmen (und seien sie nur unzuverl\"assig
             \cite{Chandra:1996:UFD}) ben\"otigt werden, um Konsens zu
             erzielen, auf welche Nachrichten sich noch zu warten lohnt und
             welche verloren gingen. Diese Zeitannahmen werden als
             Gleichm\"a\ss{}igkeit (steadiness) und Laufzeitvarianz (tightness)
             bezeichnet \cite{Verissimo:1993:RTC}. Anschlie\ss{}end wird auf
             Schedulingverfahren eingegangen. Als Anwendungsbeispiel wird die
             GMD-Snake Roboterschlange beschrieben.}
}
@Article{Karat:1998:GRU,
  author  = {Clare-Marie Karat},
  title   = {Guaranteeing Rights for the User},
  journal = j-CACM,
  volume  = {41},
  number  = {12},
  pages   = {29--31},
  year    = {1998},
  month   = dec,
  annote  = {Contains a ``user's bill of rights'' containing
             such items as ``the user is always right'' and ``the user has
             the right to a system that performs exactly as promised''.
             This is meant as a challenge to the computer industry to change
             its current view and points to the problem that dependency on
             hard- and software tends to become bigger as maintaining personnel
             and the industry are able to exploit their sole understanding
             of how things work.}
}
@PhdThesis{Kekkonen:1998:RFA,
  author = {Synn{\"o}ve Kekkonen},
  title  = {{R\'esistance} aux {Fautes} dans les {Algorithmes}
            {R\'epartis}: {Auto-Stabilisation} et {Tol\'erance}
            aux {Fautes}},
  school = {{Universit\'e} de Paris-Sud, France},
  year   = {1998},
  annote = {English title is: ``On Failure Resilience of
            Distributed Protocols: Self-Stabilization and
            Fault-Tolerance.'' {Synn\"ove} investigates the
            (im)possibility of achieving reliability in the
            presence of systemic and process failures much in
            the tradition of Anagnostou and Hadzilacos
            \cite{Anagnostou:1993:TTP}. Failures are modeled as
            state transitions in the tradition of Arora and
            Gouda \cite{Arora:1993:CCF} and there are hints to
            defining a failure model as a program
            ``augmentation''. The thesis is developed in three
            stages: first, there is an elaborate chapter on
            modelling distributed systems as transition systems
            and defining/proving fault tolerance properties on
            them. Second, the self-stabilization approach is
            used to build stabilizing failure detectors and to
            solve torus orientation in anonymous
            networks, where the non-terminating nature of the
            self-stabilization paradigm interfaces well with
            the impossibility of a terminating solution for the
            problem. Third, the possibility of simultaneous
            resilience to process and systemic failures is
            investigated. Kekkonen proves a main impossibility
            result: if a problem is $k$-fault-sensitive in an
            asynchronous $(j,k)$-restrictable network subject to
            $k>0$ process crashes, then there exists no
            $k$-fault-tolerant self-stabilizing solution to the
            problem. A network is $(j,k)$-restrictable if some
            subnetwork of $j$ nodes can be replaced by a network
            of $k$ nodes without changing the ``interface''
            structure (e.g., replacing 5 successive nodes in a
            ring by a single one). A problem is
            $k$-fault-sensitive for a specific network if there
            is a $(j,k)$-restriction of the network and the
            protocol would reach different solutions depending
            on whether these $j$ processes are alive or $k$
            processes have crashed. This is an extension of the
            result of Anagnostou and Hadzilacos
            \cite{Anagnostou:1993:TTP} and their notion of
            failure sensitivity. Examples of $k$-fault-sensitive
            problems are computing the size of a ring and the
            $c$ coloring problem on rings. Examples of
            fault-insensitive problems are unique naming,
            non-trivial eventual consensus and ring
            orientation. A heuristic for finding out whether a
            problem is fault-insensitive or not is to assume that
            the problem can be solved, and then comparing the
            set of legitimate states of systems on different
            restrictions of the original network. If they do not
            differ, then the problem is
            fault-insensitive. Overall this is a very thorough
            and concise thesis, originally written and defended
            in French.}
}
@InProceedings{Kreitz:1998:PED,
  author    = {Christoph Kreitz and Mark Hayden and Jason Hickey},
  title     = {A proof environment for the development of group
               communication systems},
  booktitle = {15th International Conference on Automated Deduction},
  editor    = {Claude Kirchner and H{\'e}l{\`e}ne Kirchner},
  number    = {1421},
  series    = {Lecture Notes in Artificial Intelligence},
  publisher = pub-SV,
  year      = {1998},
  OPTannote = {Ensemble is a group communication environment in the
               tradition of Isis and written in OCaml, a language
               similar to ML and thus well suited to be manipulated
               with NuPRL. The authors show how to import Ensemble
               code into NuPRL, verify certain aspects of a
               specification and export the code again for
               execution. Fault-tolerance is added by using failure
               detectors and focus is put on safety
               requirements. Timed I/O automata are used as the
               basis for formal reasoning about distributed systems.
               [bibliographic data needs polish!]}
}
@Unpublished{Kreitz:1998:SWL,
  author = {Christoph Kreitz},
  title  = {``{\textit{Safety}} ist wichtig, {\textit{liveness}}
            sieht man.''},
  note   = {Personal communication.},
  year   = {1998},
  month  = mar,
  annote = {Annotation during a talk on the Ensemble system at
            TU Darmstadt, March 12th, 1998, concerning a proof
            of a safety property. Liveness was up to that time
            of no concern in the project
            \cite{Kreitz:1998:PED}. See also the paper on Ariane
            5 \cite{Dega:1996:RMA}, which supports this claim.}
}
@Article{Kshemkalyani:1998:NSC,
  author  = {Ajay D. Kshemkalyani and Mukesh Singhal},
  title   = {Necessary and Sufficient Conditions on Information for
             Causal Message Ordering and their Optimal
             Implementation},
  journal = j-DC,
  volume  = {11},
  number  = {2},
  pages   = {91--111},
  year    = {1998},
  annote  = {[to read]}
}
@InProceedings{Lamport:1998:CWM,
  author    = "Leslie Lamport",
  title     = "Composition: {A} way to make proofs harder",
  booktitle = "Compositionality: The Significant Difference (Proceedings
               of the COMPOS'97 Symposium)",
  editor    = "Willem-Paul de Roever and Hans Langmaak and Amir Pnueli",
  number    = "1536",
  series    = ser-LNCS,
  publisher = pub-SV,
  pages     = "402--423",
  year      = "1998",
  annote    = "Good title - good paper. Argues that compositionality makes
               proofs grow fast and that the additional effort is substantial
               if no automation is used. Gives an example."
}
@TechReport{Liu:1998:SVF,
  author      = {Zhiming Liu and Mathai Joseph},
  title       = {Specification and verification of fault-tolerance,
                 timing and scheduling},
  institution = {Department of Mathematics and Computer Science,
                 University of Leicester, U.K.},
  number      = {1998/5},
  year        = {1998},
  annote      = {Accepted at ACM TOPLAS. Extends earlier work of Liu and
                 Joseph \cite{Liu:1992:TPF,Liu:1993:SVR,Liu:1994:SDF,Liu:1996:VFR}
                 to transformational reasoning about fault-tolerant real-time
                 systems. The formalism used is TLA and only safety properties of
                 programs are considered. The computational model and the way to
                 reason about fault tolerant systems using refinement is
                 essentially the same as in \cite{Liu:1996:VFR,Liu:1992:TPF} but is
                 nicely summarized and brought into the TLA framework. The main
                 body of the paper deals with timing and scheduling. These issues
                 are introduced into the formalism by introducing lower and upper
                 time bounds to actions and adding a real time clock (similar to
                 \cite{Abadi:1994:OFR}). The global fault assumption must be
                 extended to specify minimum time lengths in which faults are not
                 repeated and by assuming that the scheduler is not subject to
                 faults. Much interest is laid on timing feasibility meaning
                 whether there exists a scheduler to schedule a program
                 correctly. It is shown how to reason compositionally about
                 programs combined with schedulers, thus abstracting away from any
                 specific implementation or policy. This is exemplified by taking a
                 fixed priority scheduling scheme from the literature and using it
                 to show feasibility. Discussion of related work mostly covers
                 scheduling work, while initial historical remarks also deal with
                 formal methods in fault tolerance. A very good paper; can be seen
                 as quintessence of Liu and Joseph's work over the last decade.}
}
@Article{Marcopulos:1998:FBC,
  author  = {Ted Marcopulos},
  title   = {Faster, better, cheaper space exploration},
  journal = {IEEE Spectrum},
  volume  = {35},
  number  = {8},
  pages   = {68--74},
  year    = {1998},
  month   = aug,
  annote  = {The author surveys NASA's recent attempts to apply
             commercial management and development schemes to
             their current space exploration programs. It turns
             out that there is a strive towards eliminating
             redundancy in large parts of the system because
             components are already reliable enough for unmanned
             space flight and redundancy is costly both in
             weight, dollars and software/hardware
             complexity. This is a good reference together with
             \cite{Dega:1996:RMA}.}
}
@Article{Marcus:1998:WTD,
  author  = {Stephen J. Marcus},
  title   = {What to do about bolts from the blue},
  journal = {IEEE Spectrum},
  volume  = {35},
  number  = {12},
  pages   = {34--41},
  year    = {1998},
  month   = dec,
  annote  = {Fascinating report on the danger of the earth being hit by
             an asteroid and the issues involved. A large scale example of being
             able to tolerate severe faults by detection and correction.}
}
@InProceedings{Merritt:1998:FSO,
  author    = {Michael Merritt and Gadi Taubenfeld},
  title     = {Fairness of Shared Objects},
  booktitle = {Proceedings of the 12th International Symposium on
               DIStributed Computing (DISC'98)},
  series    = ser-LNCS,
  number    = {1499},
  address   = {Andros, Greece},
  pages     = {303--316},
  year      = {1998},
  month     = sep,
  annote    = {Here, fairness is not defined with respect to processes or
               schedulers, but with respect to accesses to distinct shared
               objects. This is a way of encapsulating fairness assumptions (and
               thus timing assumptions) into modules quite nicely. Four types of
               fair objects are considered: deadlock-free (if some process tries to
               access some object, eventually some process will succeed to access
               that object), starvation-free (if a process tries to access an
               object, then he will eventually succeed), bounded-waiting
               (deadlock-free and there is an (unknown?) upper bound $r$ on the
               number of times that some other process can access an object before
               another process wanting to access the object), $r$-bounded-waiting
               (deadlock-free and there is a fixed upper bound $r$ on the number of
               times other processes can succeed before myself). It turns out that
               deadlock-free objects are weaker than starvation-free objects (using
               starvation-free objects makes some problems solvable),
               starvation-free and bounded-waiting objects are ``similar'' and
               $r$-bounded-waiting objects are much stronger than bounded-waiting
               objects. A nice result shows that safety properties are immune to
               fairness assumptions (similar result is attributed to
               \cite{Alur:1997:TAA}). There's a good related work section discussing
               the relationship between time/fairness and system models.}
}
@InProceedings{Riordan:1998:CEP,
  author    = "J. Riordan and B. Schneier",
  title     = "A Certified E-Mail Protocol with No Trusted Third Party",
  booktitle = "Proceedings of the 13th Annual Computer Security Applications Conference",
  year      = "1998",
  month     = dec,
  annote    = "Presents the same protocol as \cite{Pagnia:1997:TMP}
               and relates it to the current Internet infrastructure.
               Get it at: http://www.counterpane.com/certified-email.html"
}
@InProceedings{Rothermel:1998:FPP,
  author    = {Kurt Rothermel and Markus Stra{\ss}er},
  title     = {A Fault-Tolerant Protocol for Providing the
               Exactly-Once Property of Mobile Agents},
  booktitle = {Proc. 17th IEEE Symposium on Reliable Distributed
               Systems 1998 (SRDS'98)},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, California},
  pages     = {100--108},
  year      = {1998},
  annote    = {[to read]}
}
@InCollection{Sander:1998:PMA,
  author    = "T. Sander and C. F. Tschudin",
  title     = "Protecting Mobile Agents Against Malicious Hosts",
  booktitle = "Mobile Agents and Security",
  crossref  = "Vigna:1998:MAS",
  annote    = {Angabe von Uwe Wilhelm}
}
@InProceedings{Sander:1998:TMC,
  author          = {T. Sander and C. Tschudin},
  title           = {Towards Mobile Cryptography},
  booktitle       = {Proceedings of the {IEEE} Symposium on Research in
                     Security and Privacy},
  OPTorganization = {{IEEE} Computer Society, Technical Committee on
                     Security and Privacy},
  publisher       = pub-IEEE,
  address         = {Oakland, CA},
  year            = {1998},
  month           = may,
  added-at        = {Wed Apr 8 11:17:26 1998},
  online          = {http://www.icsi.berkeley.edu/~tschudin/ps/ieee-sp98.ps.gz},
  abstract        = {Mobile code technology has become a driving force for
                     recent advances in distributed systems. The concept of
                     mobility of executable code raises major security
                     problems. In this paper we deal with the protection of
                     mobile code from possibly malicious hosts. We
                     conceptualize on the specific cryptographic problems
                     posed by mobile code. We are able to provide a solution
                     for some of these problems: We present techniques how
                     to achieve ``non--interactive computing with encrypted
                     programs'' in certain cases and give a complete
                     solution for this problem in important instances. We
                     further present a way how an agent might securely
                     perform a cryptographic primitive, digital signing, in
                     an untrusted execution environment. Our results are
                     based on the use of homomorphic encryption schemes and
                     function composition techniques.},
  annote          = {interesting paper doing a significant step towards
                     protecting mobile code from its host without requiring
                     trusted hardware. currently only solutions for
                     rationals/polynomial functions are outlined (but not
                     yet for boolean circuits (equivalent to turing machines
                     !)) and there is also still a need for secure
                     birational functions to make the ideas work.}
}
@InProceedings{Schneider:1998:FAN,
  author    = {Steve Schneider},
  title     = {Formal Analysis of a Non-Repudiation Protocol},
  booktitle = {PCSFW: Proceedings of The 11th Computer Security
               Foundations Workshop},
  publisher = {IEEE Computer Society Press},
  pages     = {54--65},
  year      = {1998},
  annote    = {The author presents a formal analysis of Zhou/Gollmann
               fair non-repudiation protocol \cite{Zhou:1996:FNP} (which is in fact
               similar to the protocol of \cite{Pagnia:1999:EGP}). The formalism
               used is CSP \cite{Hoare:1984:CSP}. Apart from the rigor in which the
               protocol is modeled and proved, an interesting fact here is that the
               author also stumbles over the necessity of liveness in the
               specification (an aspect discussed in \cite{Pagnia:1999:IFE}): state
               can be ``imposed'' on a process by assuring that it is able to make
               a state change if the process wants to. This is formalized as the
               following liveness property: if process A wants to make a state
               change depending on the receipt of message m from the trusted third
               party, then A will eventually receive m. This implies that the
               trusted third party is continuously available and has m ready and
               waiting for delivery to A. In this paper, A queries the trusted
               third party. Consequently, reliable communication to the trusted
               authority must be assumed. Another interesting point is a
               `generates' relation between messages which is used in the
               proof. This reminds of the formalization of non-cooperative
               Byzantine faults \cite{Echtle:1999:UCB}.}
}
@Article{Schneier:1998:CDV,
  author  = "Bruce Schneier",
  title   = "Cryptographic design vulnerabilities",
  journal = j-COMPUTER,
  volume  = "31",
  number  = "9",
  pages   = "29--33",
  year    = "1998",
  month   = sep,
  annote  = "Briefly discusses the notions of detection and
             correction in the context of cryptography and security."
}
@InProceedings{Siegel:1998:FVS,
  author    = {Michael Siegel},
  title     = {Formal verification of stabilizing systems},
  booktitle = {Proceedings of the 5th International Symposium on
               Formal Techniques in Real Time and Fault Tolerant Systems
               (FTRTFT'98)},
  editor    = {Anders P. Ravn and Hans Rischel},
  number    = {1486},
  series    = ser-LNCS,
  publisher = pub-SV,
  address   = {Lyngby, Denmark},
  year      = {1998},
  month     = sep,
  annote    = {Describes a calculus to perform formal proofs of
               stabilizing algorithms. The setting is fair transition systems
               and temporal logic. Gives proof rules for composing and refining
               stabilizing systems.}
}
@InProceedings{Singhai:1998:SFI,
  author    = {Ashish Singhai and Swee-Boon Lim and Sanjay R. Radia},
  title     = {The {SunSCALR} framework for Internet Servers},
  booktitle = pro-ftcs98,
  pages     = {108--117},
  year      = {1998},
  month     = jun,
  annote    = {First available implementation of a self-stabilizing
               algorithm in an industrial product. Also an example
               for the applications of non-masking fault tolerance.}
}
@InProceedings{Stoller:1998:ASB,
  author    = {Scott D. Stoller and Fred B. Schneider},
  title     = {Automated Stream-Based Analysis of Fault-Tolerance},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant Systems},
  series    = ser-LNCS,
  volume    = {1486},
  publisher = pub-SV,
  address   = {Lyngby, Denmark},
  pages     = {113--122},
  year      = {1998},
  month     = sep,
  url       = {http://ftp.cs.indiana.edu/pub/stoller/FTRTFT98-extended.ps.gz},
  annote    = {[to read]}
}
@InProceedings{Tarafdar:1998:AFC,
  author    = {Ashis Tarafdar and Vijay K. Garg},
  title     = {Addressing false causality while detecting predicates in
               distributed programs},
  booktitle = pro-icdcs98,
  address   = {Amsterdam, The Netherlands},
  pages     = {94--101},
  year      = {1998},
  url       = {http://www.ece.utexas.edu/~garg/dist/dcs98-ashis.ps.Z},
  annote    = {The general causality relation based on the happened
               before relation imposes a causal order on events which may be
               causally unrelated. For example, two successive events on one
               process are causally ordered by default, but this order may simply
               have been imposed by a random scheduler and the events are in fact
               events on two independent threads of the process. This is called
               `false causality' and has been a critique of the happened-before
               model of distributed computations. The authors present a way of
               extending the partial order model to ``split up'' the execution
               order of a process into multiple threads and actually treating
               independent events as such in the causality relation. This adds
               complexity to the problem and they show that it becomes NP
               complete (the original problem is NP complete already
               \cite{Chase:1998:DGP}). However, for a restricted class of
               predicates (weak conjunctive ones) they give an efficient
               algorithm to detect them. States that approaches to predicate
               detection fall into three classes: (1) snapshot based ones
               \cite{Chandy:1985:DSD} only suitable for stable predicates, (2)
               lattice construction based ones \cite{Cooper:1991:CDG} and (3)
               restriction based approaches like those of Garg.}
}
@InProceedings{Theel:1998:OPS,
  author    = {Oliver Theel and Felix C. {G\"artner}},
  title     = {On proving the stability of distributed algorithms:
               self-stabilization vs. control theory},
  booktitle = {Proceedings of the International Systems, Signals,
               Control, Computers Conference (SSCC'98), Durban,
               South Africa},
  editor    = {Vladimir B. Bajic},
  volume    = {III},
  pages     = {58--66},
  year      = {1998},
  month     = sep,
  note      = {},
  annote    = {[to write ;-)]}
}
@Article{Thurner:1998:VKS,
  author  = {Erwin Thurner and Mario Dal Cin and Winfried
             {Schneewei\ss{}}},
  title   = {{Verl\"a\ss{}lichkeitsbewertung komplexer Systeme}},
  journal = j-IS,
  volume  = {21},
  number  = {6},
  pages   = {318--327},
  year    = {1998},
  month   = dec,
  annote  = {Deutsche Einf\"uhrung in Begriffe wie Zuverl\"assigkeit,
             mittlere Lebensdauer (MTTF), Ausfallrate, Sicherheit, MTBF,
             Verf\"ugbarkeit, sowie die Methoden Fehlerb\"aume, Markovketten
             und hybride Ans\"atze. Konzentration auf analytische Bewertungen,
             nicht auf experimentelle (f\"ur experimentelle siehe
             \cite{Echtle:1998:FMB}).}
}
@Book{Vigna:1998:MAS,
  editor    = "G. Vigna",
  title     = "Mobile Agents and Security",
  series    = ser-LNCS,
  volume    = "1419",
  publisher = pub-SV,
  address   = "Berlin",
  year      = "1998",
  annote    = {Angabe von Uwe Wilhelm}
}
@InProceedings{Voelzer:1998:VFT,
  author    = {Hagen {V\"olzer}},
  title     = {Verifying fault tolerance of distributed algorithms
               formally: {An} example},
  booktitle = {Proceedings of the International Conference on
               Application of Concurrency to System Design (CSD98)},
  publisher = pub-IEEE,
  address   = {Fukushima, Japan},
  pages     = {187--197},
  year      = {1998},
  month     = mar,
  annote    = {This paper investigates the fully mechanical
               verification of fault tolerant algorithms using the
               DAWN approach \cite{Weber:1997:DAW} which is based
               on Petri nets. The main point in doing so is to
               formally handle faults and fault models. This is
               done by distinguishing an (informal) fault model
               from a formal fault impact model specified by a
               Petri net. In this example, crash and omission
               faults are formalized by additional state
               transitions which are superimposed onto an algorithm
               for fault free executions. Additionally to a fault
               impact model, a ``rely'' property belongs to the
               fault model. Such a property formalizes
               ``assumptions about the environment'' like the
               maximum number of faults that may occur, and it
               makes these assumptions exploitable by a proof. The
               example algorithm used is the SELF-2 fault diagnosis
               algorithm by Kuhl and Reddy. The paper shows the
               advantages of Petri nets in formulating and reasoning
               about distributed algorithms. The superimposition
               property of such nets makes the approach extremely
               useful for fault tolerant algorithms.}
}
@inproceedings{Wilhelm:1998:PTM,
  author = {U. G. Wilhelm and L. Butty{\'a}n and S. Staamann},
  title = {On the Problem of Trust in Mobile Agent Systems},
  booktitle = {Symposium on Network and Distributed System Security},
  publisher = {Internet Society},
  pages = {114--124},
  year = {1998},
  month = mar,
  keywords = {IMPORTANT; Security},
  annote = {[to read]}
}
@Article{Aguilera:1999:UHF,
  author = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title = {Using the heartbeat failure detector for quiescent
    reliable communication and consensus in partitionable
    networks},
  journal = {Theoretical Computer Science},
  year = {1999},
  month = jun,
  day = {06},
  volume = {220},
  number = {1},
  pages = {3--30},
  coden = {TCSCDI},
  ISSN = {0304-3975},
  url = {http://www.elsevier.com/cas/tree/store/tcs/sub/1999/220/1/3045.pdf},
  bibdate = {Mon Jul 19 22:22:41 MDT 1999},
  acknowledgement = ack-nhfb,
  annote = {[to read]}
}
@TechReport{Aguilera:1999:WFD,
  author = {Marcos Kawazoe Aguilera and Sam Toueg and Borislav
    Deianov},
  title = {On the Weakest Failure Detector for Uniform
    Reliable Broadcast},
  institution = {Cornell University, Computer Science},
  number = {TR99-1741},
  year = {1999},
  month = apr # " 30,",
  abstract = {Uniform Reliable Broadcast (URB) is a communication
    primitive that requires that if a process delivers a
    message, then all correct processes also deliver this
    message. A recent PODC paper \cite{Halpern:1999:KAU}
    uses Knowledge
    Theory to determine what failure detectors are
    necessary to implement this primitive in asynchronous
    systems with process crashes and lossy links that are
    fair. In this paper, we revisit this problem using a
    different approach, and provide a result that is
    simpler, more intuitive, and, in a precise sense, more
    general.},
  annote = {}
}
@Article{Benassi:1999:T,
  author = {Paola Benassi},
  title = {{TRUSTe}: An online privacy seal program},
  journal = {Communications of the ACM},
  year = {1999},
  month = feb,
  volume = {42},
  number = {2},
  pages = {56--59},
  coden = {CACMA2},
  ISSN = {0001-0782},
  url = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p56-benassi/},
  bibdate = {Fri Feb 5 07:01:55 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {TRUSTe is a trustmark or a seal which providers can put onto
    their web pages in order to indicate sound privacy practices.
    The trustmark is issued by an organization which checks the
    pages (\url{www.truste.org}). See also \cite{Reagle:1999:PPP}.}
}
@Article{Billinghurst:1999:WDN,
  author = {Mark Billinghurst and Thad Starner},
  title = {Wearable Devices: New Ways to Manage Information},
  journal = {Computer},
  year = {1999},
  month = jan,
  volume = {32},
  number = {1},
  pages = {57--64},
  coden = {CPTRB4},
  ISSN = {0018-9162},
  url = {http://www.computer.org/computer/co1999/r1057abs.htm;
    http://dlib.computer.org/co/books/co1999/pdf/r1057.pdf},
  bibdate = {Fri Jan 15 16:17:58 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {A thrilling and fascinating article on a somewhat
    underestimated branch of computer science. Computers can be
    incorporated into clothing, eyeglasses, can be worn around the neck,
    in a wristwatch, etc. Applications of wearable computers (also
    non-military) are given: navigation using augmented reality,
    wearable bar code scanners at UPS. The article also takes a shot at
    predicting what comes next: for example using augmented reality to
    do conferencing. Pointers to conferences, companies and research
    projects concerning wearables round up the article. For
    a market survey as of 2000 see \cite{Ditlea:2000:PCG}.}
}
@Article{Boyle:1999:DYT,
  author = {James M. Boyle and R. Daniel Resler and Victor L.
    Winter},
  title = {Do You Trust Your Compiler?},
  journal = {Computer},
  year = {1999},
  month = may,
  volume = {32},
  number = {5},
  pages = {65--73},
  url = {http://www.computer.org/computer/co1999/r5065abs.htm;
    http://dlib.computer.org/co/books/co1999/pdf/r5065.pdf},
  annote = {There are two problems involved when using formal methods
    to produce correct software: (1) coming up with an accurate formal
    specification of the problem, and (2) producing a correct
    implementation of the specification or, respectively, verifying
    that a given
    implementation is correct regarding the specification. This paper
    addresses the second problem and uses buggy compilers to motivate
    it. Bugs in compilers are well-documented (see news:gnu.gcc.bug for
    example). The idea is to start with a high level code and apply
    correctness preserving transformations to it until a lower level
    code is reached. Denotational semantics are used to define
    `correctness preserving'. As an open research problem it is noted
    that producing code from safety and liveness specifications would be
    good.}
}
@InProceedings{Cardellini:1999:RAL,
  author = {Valeria Cardellini and Michele Colajanni and Philip S. Yu},
  title = {Redirection Algorithms for Load Sharing in Distributed
    Web-server Systems},
  booktitle = pro-icdcs99,
  editor = {Mohamed G. Gouda},
  pages = {528--535},
  year = {1999},
  month = may # "/" # jun,
  publisher = pub-IEEE,
  OPTcrossref = {},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  OPTorganization = {},
  OPTnote = {},
  annote = {[to read]}
}
@Article{Carreira:1999:FIS,
  author = {Jo{\~a}o Viegas Carreira and Diamantino Costa
    and Jo{\~a}o Gabriel Silva},
  title = {Fault injection spot-checks computer system dependability},
  journal = {IEEE Spectrum},
  year = {1999},
  month = aug,
  volume = {36},
  number = {8},
  pages = {50--55},
  OPTkey = {},
  OPTnote = {},
  annote = {A good motivation and introduction to fault injection from
    a more hardware point of view than
    \cite{Hsueh:1997:FIT,Echtle:1998:FMB}. Contains terms Heisenbugs (a
    failure that is not reconstructable), and Bohrbugs (the
    opposite). Like \cite{Rushby:1994:CSP} states that attaching
    reliability figures to a system is problematic, even if the failure
    model is precisely fixed. States that there is research in Sematech,
    HP, Compaq and Stanford to collect real fault data and thus enable
    more realistic failure models.}
}
@Article{Cristian:1999:TAD,
  author = {Flaviu Cristian and Christof Fetzer},
  title = {The Timed Asynchronous Distributed System Model},
  journal = {{IEEE} Transactions on Parallel and Distributed
    Systems},
  year = {1999},
  month = jun,
  volume = {10},
  number = {6},
  pages = {642--657},
  url = {http://www-cse.ucsd.edu/users/cfetzer/MODEL/},
  abstract = {We propose a formal definition for the timed
    asynchronous distributed system model. We present
    extensive measurements of actual message and process
    scheduling delays and hardware clock drifts. These
    measurements confirm that this model adequately
    describes current distributed systems such as a network
    of workstations. We also give an explanation of why
    practically needed services, such as consensus or
    leader election, which are not implementable in the
    time-free model, are implementable in the timed
    asynchronous system model.},
  language = {English},
  annote = {A revised version of \cite{Cristian:1998:TAS}.}
}
@InProceedings{Echtle:1999:UCB,
  author = {Klaus Echtle and Asif Masum},
  title = {Understanding Cooperative Byzantine Failures: A Novel Failure
    Classification to Enable Efficient Fault-Tolerant Protocols},
  booktitle = {Proceedings of the Annual IEEE Workshop on Fault-Tolerant
    Parallel and Distributed Systems (FTPDS'99)},
  year = {1999},
  month = apr,
  address = {San Juan, Puerto Rico, USA},
  publisher = {Kluwer},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTnote = {},
  annote = {The authors present a unifying approach to modeling the
    common fault classes formally. This is the most general approach
    that I know of and in contrast to \cite{Gaertner:1998:SFT} seems to
    be easier to adapt to the common fault classes (and is able to
    derive new ones). The work contains a near-to-complete list of
    references to fault classification work and puts the terms fault,
    error and failure in a nice layered context (p. 2). The model
    consists of a set of $n$ components that can be separated into fault
    free and faulty ones. Components communicate by sending messages
    from some fixed message set. Sending and receipt of a message
    trigger events. An event is a tuple consisting of the message, the
    event type (send/receive) and a time tag, which specifies the global
    point in continuous real time in which the event occurs. Component
    behaviors can now be described as event sets, which through the time
    tag implicitly define a single (?!) sequence of events (not a set of
    sequences?). A specification $S_i$ for component $i$ is a set of
    correct input/output tuples, i.e. a relation over input sequences
    and output sequences. Failure modes are defined in a functional way:
    a failure mode identifies sets of behaviors which a component may
    exhibit following the occurrence of a set of receive events. Now it
    is possible to define the different ``failure mode functions'' for
    correct behavior, fail-silent, fail-omission, message loss, message
    duplication etc. by changing tags in message sets or message sets
    themselves. To define failures affecting code integrity
    (e.g. altered messages) the authors define the concept of a failure
    capability $C_i$ for component $i$. This can be seen as a degraded
    component specification, i.e. is the set of behaviors allowed by $i$
    if it is faulty. Using this construct it is possible to derive a
    rich set of distinctive failure modes visualized in Fig. 10. As a
    further novelty, the authors introduce a new failure mode, that of
    non-cooperative Byzantine. This is where no malicious cooperation
    takes place between faulty nodes. This is formalized along the idea
    that such behavior must be based on either (1) malicious treason
    (e.g., revealing a secret key) or (2) malicious delegation
    (e.g. some node asks another node to sign a message). Malicious
    cooperation is then defined (on p. 19) as ``increasing the failure
    capability by the receipt of a message'' (see also the `generates'
    relation of \cite{Schneider:1998:FAN}). Non-cooperative behavior is
    defined as the complement of malicious cooperation. It is nice to
    have different types of Byzantine behaviors because this can result
    in protocols that are more efficient. This is shown by example.
    Overall a formal, but very rewarding paper which can also be used as
    an overview over the state of the art in failure classification.
    See also \cite{Echtle:2000:FFM} and Asif's thesis.}
}
@InProceedings{Essame:1999:PPA,
  author = {Didier Essame and Jean Arlat and David Powell},
  title = {Padre: {A} protocol for asymmetric duplex redundancy},
  booktitle = {Proceedings of the Seventh IFIP International Working
    Conference on Dependable Computing for Critical Applications},
  year = {1999},
  month = jan,
  address = {San Jose, USA},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {[to get] uses the timed asynchronous model to build
    a fully automated train control system, cited in
    \cite{Cristian:1999:TAD}.}
}
@InProceedings{Felber:1999:FDF,
  author = {Pascal Felber and Xavier D{\'e}fago and Rachid
    Guerraoui and P. Oser},
  title = {Failure Detectors as First Class Objects},
  booktitle = {Proceedings of the International Symposium on
    Distributed Objects and Applications (DOA'99)},
  pages = {132--141},
  year = {1999},
  month = sep,
  address = {Edinburgh, Scotland},
  annote = {[to get]}
}
@Article{Felber:1999:POD,
  author = {Pascal Felber and Rachid Guerraoui and Mohamed E.
    Fayad},
  title = {Putting {OO} distributed programming to work},
  journal = {Communications of the ACM},
  year = {1999},
  month = nov,
  volume = {42},
  number = {11},
  pages = {97--101},
  url = {http://www.acm.org/pubs/articles/journals/cacm/1999-42-11/p97-felber/p97-felber.pdf;
    http://www.acm.org/pubs/citations/journals/cacm/1999-42-11/p97-felber/},
  annote = {Discusses different approaches to specify, model and implement
    failure detectors. Distinguishes the push model, pull model and
    the dual model (combination of push and pull). Similar title is
    \cite{Felber:1999:FDF}. Failure detector implementations also
    discussed in \cite{Sergent:1999:FDI}.}
}
@InProceedings{Fetzer:1999:CTA,
  author = {Christof Fetzer},
  title = {A comparison of timed asynchronous systems and
    asynchronous systems with failure detectors},
  booktitle = {Proceedings of the Third European Research Seminar
    on Advances in Distributed Systems (ERSADS'99)},
  pages = {109--118},
  year = {1999},
  month = apr,
  address = {Madeira Island, Portugal},
  OPTcrossref = {},
  OPTkey = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {[to write]}
}
@Article{Gabber:1999:CYA,
  author = {Eran Gabber and Phillip B. Gibbons and David M.
    Kristol and Yossi Matias and Alain Mayer},
  title = {Consistent, yet anonymous, {Web} access with {LPWA}},
  journal = {Communications of the ACM},
  year = {1999},
  month = feb,
  volume = {42},
  number = {2},
  pages = {42--47},
  coden = {CACMA2},
  ISSN = {0001-0782},
  url = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p42-gabber/},
  bibdate = {Fri Feb 5 07:01:55 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {The LPWA is the Lucent Personalized Web Assistant, a tool which
    helps you manage different pseudonyms and thus manage anonymity
    on the web. Related articles are about Crowds
    \cite{Reiter:1999:AWT}, onion routing \cite{Goldschlag:1999:OR},
    and \cite{Reagle:1999:PPP,Benassi:1999:T}.}
}
@InProceedings{Gaertner:1999:AFD,
  author = {Felix C. {G\"artner} and Henning Pagnia and Holger Vogt},
  title = {Approaching a formal definition of fairness in
    electronic commerce},
  booktitle = {Proceedings of the International Workshop on Electronic
    Commerce (WELCOM'99)},
  pages = {354--359},
  year = {1999},
  month = oct,
  address = {Lausanne, Switzerland},
  publisher = pub-IEEE,
  OPTcrossref = {},
  OPTkey = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTnote = {},
  annote = {[to write]}
}
@Unpublished{Gaertner:1999:DR,
  author = {Felix C. {G\"artner} and Hagen {V\"olzer}},
  title = {Defining Redundancy in Fault-Tolerant Computing},
  year = {1999},
  note = {unpublished manuscript},
  OPTkey = {},
  OPTmonth = {},
  OPTannote = {}
}
@InProceedings{Gaertner:1999:ESD,
  author = {Felix C. {G\"artner}},
  title = {An exercise in systematically deriving fault-tolerance
    specifications},
  booktitle = {Proceedings of the Third European Research Seminar on
    Advances in Distributed Systems (ERSADS)},
  year = {1999},
  month = apr,
  address = {Madeira Island, Portugal},
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {Shorter Version of \cite{Gaertner:1999:ESDFS}.}
}
@TechReport{Gaertner:1999:ESDFS,
  author = {Felix C. {G\"artner}},
  title = {An exercise in systematically deriving fault-tolerance
    specifications},
  institution = {Department of Computer Science, Darmstadt University
    of Technology},
  number = {TUD-BS-1999-01},
  address = {Darmstadt, Germany},
  year = {1999},
  month = mar,
  OPTkey = {},
  OPTtype = {},
  OPTnote = {Available at http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-01.ps.gz},
  annote = {}
}
@Article{Gaertner:1999:FFT,
  author = {Felix C. {G\"artner}},
  title = {Fundamentals of fault-tolerant distributed computing in
    asynchronous environments},
  journal = j-ACM-COMP-SURVEYS,
  volume = {31},
  number = {1},
  pages = {1--26},
  year = {1999},
  month = mar,
  OPTkey = {},
  OPTnote = {},
  annote = {updated version of \cite{Gaertner:1998:FFT}.}
}
@TechReport{Gaertner:1999:FUF,
  author = {Felix C. {G\"artner} and Armin Wolfram},
  title = {{Fehlererkennung und Fehlerdiagnose f\"ur
    verl\"a\ss{}liche Systeme -- Automatisierungstechnik
    vs.~verteilte Systeme}},
  institution = {Department of Computer Science, Darmstadt University
    of Technology},
  number = {TUD-BS-1999-03},
  address = {Darmstadt, Germany},
  year = {1999},
  month = jul,
  OPTkey = {},
  OPTtype = {},
  OPTnote = {},
  OPTannote = {}
}
@InProceedings{Gaertner:1999:SLD,
  author = {Felix C. {G\"artner} and Henning Pagnia},
  title = {Self-stabilizing Load Distribution for Replicated
    Servers on a Per-Access Basis},
  booktitle = pro-wss99,
  editor = {Anish Arora},
  pages = {102--109},
  year = {1999},
  month = jun,
  address = {Austin, TX},
  publisher = pub-IEEE,
  OPTcrossref = {},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTnote = {},
  annote = {A self-stabilizing extension to existing load balancing
    schemes (such as \cite{Arora:1997:OCC,Arora:1995:ECC,Gronning:1990:SDD})
    to allow fine grained load distribution based on redirection. Pointers
    to commercial realizations appear in \cite{Cardellini:1999:RAL}.}
}
@TechReport{Gaertner:1999:STA,
  author = {Felix C. {G\"artner}},
  title = {A survey of transformational approaches to the
    specification and verification of fault-tolerant systems},
  institution = {Department of Computer Science, Darmstadt University
    of Technology},
  number = {TUD-BS-1999-04},
  address = {Darmstadt, Germany},
  year = {1999},
  month = apr,
  note = {Journal version appeared in Journal of Universal Computer
    Science (J.UCS) 5(10):668--692, special issue on ``Dependability
    Evaluation and Assessment'', October 1999.},
  OPTkey = {},
  OPTtype = {},
  OPTannote = {Journal version \cite{Gaertner:1999:TAS}.}
}
@Article{Gaertner:1999:TAS,
  author = {Felix C. {G\"artner}},
  title = {Transformational Approaches to the Specification and
    Verification of Fault-Tolerant Systems: {Formal}
    Background and Classification},
  journal = {Journal of Universal Computer Science (J.UCS)},
  volume = {5},
  number = {10},
  pages = {668--692},
  year = {1999},
  month = oct,
  note = {Special Issue on Dependability Evaluation and Assessment},
  OPTkey = {},
  annote = {Prior technical report \cite{Gaertner:1999:STA}.}
}
@Article{Glass:1999:RST,
  author = {Robert L. Glass},
  title = {The realities of software technology payoffs},
  journal = {Communications of the ACM},
  year = {1999},
  month = feb,
  volume = {42},
  number = {2},
  pages = {74--79},
  coden = {CACMA2},
  ISSN = {0001-0782},
  url = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p74-glass/},
  bibdate = {Fri Feb 5 07:01:55 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {Glass studies which new software engineering practices
    have turned out to pay off in the long run. These technologies are:
    structured techniques, fourth generation languages, CASE, formal
    methods, cleanroom methodology, process models,
    object-orientation. Especially interesting to me is the discussion
    of formal methods. Glass says that it has been little used because
    it still is largely underdefined and underevaluated. Only one study
    has brought forward hard numbers \cite{Ralston:1991:FMH}.}
}
@Article{Goldschlag:1999:OR,
  author = {David Goldschlag and Michael Reed and Paul Syverson},
  title = {Onion routing},
  journal = {Communications of the ACM},
  year = {1999},
  month = feb,
  volume = {42},
  number = {2},
  pages = {39--41},
  coden = {CACMA2},
  ISSN = {0001-0782},
  url = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p39-goldschlag/},
  bibdate = {Fri Feb 5 07:01:55 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {See \url{www.onion-router.net}. Other methods to
    achieve privacy on the net are discussed in other articles from this
    CACM issue \cite{Reiter:1999:AWT,Gabber:1999:CYA,Reagle:1999:PPP}
    and \cite{Benassi:1999:T}.}
}
@Article{Grimley:1999:PIA,
  author = {Michael J. Grimley and Brian D. Monroe},
  title = {Protecting the integrity of agents: {An} exploration
    into letting agents loose in an unpredictable world},
  journal = {Crossroads - The ACM Student Magazine},
  year = {1999},
  volume = {5},
  number = {4},
  pages = {10--17},
  OPTkey = {},
  OPTmonth = {},
  OPTnote = {},
  annote = {A good and brief survey introduction into the issues
    of security of agents (both protecting agents from
    their execution environments and vice versa), with lots
    of good references. A good starting point.}
}
@InProceedings{Halpern:1999:KAU,
  author = {Joseph Y. Halpern and Aleta Ricciardi},
  title = {A knowledge-theoretic analysis of uniform distributed
    coordination and failure detectors},
  booktitle = pro-podc99,
  pages = {73--82},
  year = {1999},
  OPTcrossref = {},
  OPTkey = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTaddress = {},
  OPTmonth = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {A discussion appears in \cite{Aguilera:1999:WFD}.}
}
@Article{Hennessy:1999:FSR,
  author = {John Hennessy},
  title = {The Future of Systems Research},
  journal = {Computer},
  year = {1999},
  month = aug,
  volume = {32},
  number = {8},
  pages = {27--33},
  url = {http://www.computer.org/computer/co1999/r8027abs.htm;
    http://dlib.computer.org/co/books/co1999/pdf/r8027.pdf},
  annote = {A speculation on what will be and what should be the
    subject of research and development in systems in the next
    years. Interesting is that the author explicitly mentions
    availability as a key issue and fault-tolerance as a key
    mechanism. However, fault tolerance research must focus more on
    gradual and dynamic mechanisms, not directly hiding fault evidence
    but helping maintain availability, for example like in the RAID
    approach \cite{Patterson:1988:CRA}. A good reference for the
    importance of fault tolerance research.}
}
@Article{Hoffman:1999:PCL,
  author = {Forrest Hoffman and William Hargrove},
  title = {Parallel computing with {Linux}},
  journal = {Crossroads - The ACM Student Magazine},
  year = {1999},
  volume = {6},
  number = {1},
  pages = {23--27},
  OPTkey = {},
  OPTmonth = {},
  OPTnote = {},
  annote = {Gives a practical guide to installing a beowulf parallel
    computing system at your home. Gives a lot of online references to
    more information and is a good starting point for beowulf projects.}
}
@Article{Hurfin:1999:SFA,
  author = {Michel Hurfin and Michel Raynal},
  title = {A Simple and Fast Asynchronous Consensus Protocol
    Based on a Weak Failure Detector},
  journal = j-DC,
  year = {1999},
  volume = {12},
  number = {4},
  pages = {209--223},
  abstract = {The Consensus problem is a fundamental paradigm for
    fault-tolerant asynchronous systems. It abstracts a family of
    problems known as Agreement (or Coordination) problems. Any
    solution to consensus can serve as a basic building block for
    solving such problems (e.g., atomic commitment or atomic
    broadcast). Solving consensus in an asynchronous system is not
    a trivial task: it has been proven (1985) by Fischer, Lynch
    and Paterson that there is no deterministic solution in
    asynchronous systems which are subject to even a single crash
    failure. To circumvent this impossibility result, Chandra and
    Toueg have introduced the concept of unreliable failure
    detectors (1991), and have studied how these failure detectors
    can be used to solve consensus in asynchronous systems with
    crash failures. This paper presents a new consensus protocol
    that uses a failure detector of the class $\Diamond{\cal
    S}$. Like previous protocols, it is based on the rotating
    coordinator paradigm and proceeds in asynchronous
    rounds. Simplicity and efficiency are the main characteristics
    of this protocol. From a performance point of view, the
    protocol is particularly efficient when, whether failures
    occur or not, the underlying failure detector makes no mistake
    (a common case in practice). From a design point of view, the
    protocol is based on the combination of three simple
    mechanisms: a voting mechanism, a small finite state automaton
    which manages the behavior of each process, and the
    possibility for a process to change its mind during a round.},
  annote = {Must be noted as one of the standard consensus
    protocols among \cite{Chandra:1996:UFD} and
    \cite{Schiper:1997:ECA,Schiper:1997:EEC}.}
}
@Article{Jajodia:1999:SIW,
  author = {Sushil Jajodia and Paul Ammann and Catherine D.
    McCollum},
  title = {Surviving Information Warfare Attacks},
  journal = {Computer},
  year = {1999},
  month = apr,
  volume = {32},
  number = {4},
  pages = {57--63},
  coden = {CPTRB4},
  ISSN = {0018-9162},
  url = {http://www.computer.org/computer/co1999/r4057abs.htm;
    http://dlib.computer.org/co/books/co1999/pdf/r4057.pdf},
  bibdate = {Thu Apr 1 07:09:15 MST 1999},
  annote = {Describes the dangers which information systems are
    subject to and the traditional methods of preventing them (fault
    tolerance, database system management mechanisms). A realistic
    alternative to these two approaches is described that is a mixture
    of both, attacks and countermeasures are briefly described. While
    the exact mechanisms remain rather superficial, this paper is
    another example for the fact that security can also be seen as a
    fault tolerance problem (\cite{Arora:1998:DMF} is cited directly)
    with all the implications. See also \cite{Schneier:1998:CDV}.}
}
@InProceedings{Jochim:1999:AGD,
  author = {Markus Jochim},
  title = {Automatic Generation of Diversified Program
    Variants Optimized to Detect Hardware Faults},
  booktitle = {Tenth European Workshop on Dependable Computing (EWDC-10)},
  pages = {169--174},
  year = {1999},
  address = {Vienna, Austria},
  url = {http://www.cs.uni-essen.de/Fachgebiete/Depend/Papers/Joch99/},
  OPTcrossref = {},
  OPTkey = {},
  OPTeditor = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTmonth = {},
  OPTorganization = {},
  OPTpublisher = {},
  OPTnote = {},
  annote = {Presents ideas on how to automatically introduce
    code diversity into machine programs so that two distinct but
    semantically equivalent processes can run in parallel (virtual
    duplex system) and detect hardware errors with high probability.
    Discusses practical considerations in the design of code
    mutation rules like independence of addressing mode, overflow,
    short code production etc.}
}
@InProceedings{Johansen:1999:NAP,
  author = {Dag Johansen and Keith Marzullo and Fred B. Schneider
    and Kjetil Jacobsen and Dmitrii Zagorodnov},
  title = {{NAP}: Practical Fault-Tolerance for Itinerant
    Computations},
  booktitle = {Proceedings of the 19th IEEE International Conference
    on Distributed Computing Systems},
  editor = {Mohamed G. Gouda},
  pages = {180--189},
  year = {1999},
  month = jun,
  address = {Austin, Texas},
  publisher = pub-IEEE,
  OPTcrossref = {},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTnote = {},
  OPTannote = {}
}
@Article{Jutla:1999:MBS,
  author = {Dawn Jutla and Peter Bodorik and Catherine Hajnal and
    Charles Davis},
  title = {Making Business Sense of Electronic Commerce},
  journal = {Computer},
  year = {1999},
  month = mar,
  volume = {32},
  number = {3},
  pages = {67--75},
  coden = {CPTRB4},
  ISSN = {0018-9162},
  url = {http://www.computer.org/computer/co1999/r3067abs.htm;
    http://dlib.computer.org/co/books/co1999/pdf/r3067.pdf},
  bibdate = {Sat Mar 6 09:04:10 MST 1999},
  acknowledgement = ack-nhfb,
  annote = {A good overview over the issues involved in adopting
    and applying e-commerce in different fields of business.
    Business models and application frameworks are presented.}
}
@Article{Karaata:1999:SAB,
  author = {Mehmet Hakan Karaata and Pranay Chaudhuri},
  title = {A self-stabilizing algorithm for bridge finding},
  journal = j-DC,
  year = {1999},
  volume = {12},
  pages = {47--53},
  annote = {Finds edges which partition the graph if they are removed.
    Builds upon spanning tree algorithm by \cite{Huang:1992:SSA}.}
}
@Article{Kelley:1999:HTB,
  author = {Robert E. Kelley},
  title = {How to be a star engineer},
  journal = {IEEE Spectrum},
  volume = {36},
  number = {10},
  pages = {51--58},
  year = {1999},
  month = oct,
  OPTkey = {},
  OPTnote = {},
  annote = {Reports on a study about engineer work performance
    and discusses many misconceptions. Argues that star
    performers are normal workers who are treated in
    a special way. keyword: Soft skills, also for managers.}
}
@MastersThesis{Kloppenburg:1999:EPS,
  author = {Sven Kloppenburg},
  title = {Entdecken globaler {Pr\"adikate} in verteilten Systemen
    mit {Anhalteausf\"allen}},
  school = {Technische Universit\"at Darmstadt, Fachbereich Informatik,
    Fachgebiet Betriebssysteme},
  type = {Diplomarbeit},
  year = {1999},
  month = sep,
  note = {DA-BS-1999-02},
  OPTkey = {},
  OPTaddress = {},
  annote = {Results published in \cite{Gaertner:2000:CDG}. A cite for
    the term ``Anhalteausfall'', German for ``crash''.}
}
@PhdThesis{Kulkarni:1999:CBD,
  author = {Sandeep S. Kulkarni},
  title = {Component Based Design of Fault-Tolerance},
  school = {Department of Computer and Information Science, The Ohio
    State University},
  year = {1999},
  OPTkey = {},
  OPTtype = {},
  OPTaddress = {},
  OPTmonth = {},
  OPTnote = {},
  annote = {Several papers contain results of this thesis, e.g.
    \cite{Arora:1998:CDM}.}
}
@InProceedings{Kulkarni:1999:CSC,
  author = {Sandeep S. Kulkarni and John Rushby and Natarajan Shankar},
  title = {A Case-Study in Component-Based Mechanical Verification
    of Fault-Tolerant Programs},
  booktitle = pro-wss99,
  editor = {Anish Arora},
  pages = {33--40},
  year = {1999},
  month = jun,
  address = {Austin, TX, USA},
  publisher = pub-IEEE,
  OPTcrossref = {},
  OPTkey = {},
  OPTvolume = {},
  OPTnumber = {},
  OPTseries = {},
  OPTorganization = {},
  OPTnote = {},
  OPTannote = {}
}
@Article{Lange:1999:SGR,
  author = {Danny B. Lange and Mitsuru Oshima},
  title = {Seven good reasons for mobile agents},
  journal = {Communications of the ACM},
  year = {1999},
  month = mar,
  volume = {42},
  number = {3},
  pages = {88--89},
  url = {http://www.acm.org/pubs/articles/journals/cacm/1999-42-3/p88-lange/p88-lange.pdf},
  annote = {While the title states otherwise, the reasons presented
    here are to me rather non-reasons: 1. they reduce the
    network load, 2. they overcome network latency,
    3. they encapsulate protocols, 4. they execute
    asynchronously and autonomously, 5. they adapt
    dynamically, 6. they are naturally heterogeneous,
    7. they are robust and fault-tolerant. I find the way
    in which the individual reasons are presented very
    non-convincing, probably because the exposition is so
    brief. Some applications of agents are given
    (e-commerce, personal assistance, secure brokering,
    distributed information retrieval, ...).}
}
@Article{Lewis:1999:BCM,
  author = {Ted Lewis},
  title = {Binary Critic: Mainframes Are Dead, Long Live
    Mainframes},
  journal = {Computer},
  year = {1999},
  month = aug,
  volume = {32},
  number = {8},
  pages = {104, 102--103},
  url = {http://dlib.computer.org/co/books/co1999/pdf/r8104.pdf},
  annote = {Argues that mainframes are experiencing a revival because
    of their unmatched reliability. Gives some figures: Cost of downtime
    ranges from \$1000 per minute for simple e-mail to \$13000 per
    minute for enterprise resource planning applications. Also: An IBM
    S/390 sysplexed mainframe only has 10 minutes outage per year, while
    a windows-NT-based PC has about 224.5 hours outage per year (table
    1).}
}
@TechReport{Mantel:1999:CSM,
  author = {Heiko Mantel and Felix C. {G\"artner}},
  title = {A case study in the mechanical verification of
    fault tolerance},
  institution = {Department of Computer Science, Darmstadt University
    of Technology},
  number = {TUD-BS-1999-08},
  address = {Darmstadt, Germany},
  year = {1999},
  month = nov,
  OPTkey = {},
  OPTtype = {},
  OPTnote = {},
  OPTannote = {}
}
@InProceedings{Mostefaoui:1999:SCU,
  author = {Achour Mostefaoui and Michel Raynal},
  title = {Solving Consensus Using Chandra-Toueg's Unreliable
    Failure Detectors: a General Quorum-Based Approach},
  booktitle = {Proceedings of the 13th International Symposium on
    Distributed Computing (DISC)},
  series = ser-LNCS,
  volume = {1693},
  publisher = pub-SV,
  address = {Bratislava, Slovak Republic},
  year = {1999},
  month = sep,
  OPTcrossref = {},
  OPTkey = {},
  OPTpages = {},
  OPTeditor = {},
  OPTnumber = {},
  OPTorganization = {},
  OPTnote = {},
  annote = {Uses dynamic quorums to define when a value may be
    decided.}
}
@Article{Oberg:1999:WMP,
  author = {James Oberg},
  title = {Why the Mars Probe went off course},
  journal = {IEEE Spectrum},
  year = {1999},
  month = dec,
  volume = {36},
  number = {12},
  pages = {34--39},
  url = {http://www.spectrum.ieee.org/spectrum/dec99/features/mars.html},
  crindex = {Journal},
  annote = {A detailed report on why the mars climate orbiter crashed onto
    the surface of Mars in 1999. Popularly believed to be only an
    error in taking metric and British measurement units, the
    article shows that the orbiter failed to follow the right
    trajectory also partly because of severe management mistakes
    and sensor inaccuracies: Uncertainty led to assuming
    good things instead of bad things, so instead of a safe
    fly-by the orbiter must have crashed onto the surface of
    Mars (even that is not sure).}
}
@InProceedings{Pagnia:1999:EGP,
  author    = {Henning Pagnia and Holger Vogt},
  title     = {Exchanging goods and payment in electronic business
               transactions},
  booktitle = {Proceedings of the Third European Research Seminar on
               Advances in Distributed Systems (ERSADS)},
  address   = {Madeira Island, Portugal},
  year      = {1999},
  month     = apr,
  note      = {Proceedings distributed as copies at the conference.},
  annote    = {similar to \cite{Vogt:1999:FAE} but in English; I have
               an electronic copy in literature/pagnia-ersads.ps
               A similar protocol has appeared in \cite{Zhou:1996:FNP},
               a shorter presentation is \cite{Schneider:1998:FAN}.}
}
%%% The url field holds the bare address; the printable, \url-wrapped
%%% form is kept in the note field.
@TechReport{Pagnia:1999:IFE,
  author      = {Henning Pagnia and Felix C. G{\"a}rtner},
  title       = {On the impossibility of fair exchange without a trusted
                 third party},
  institution = {Darmstadt University of Technology, Department of
                 Computer Science},
  number      = {TUD-BS-1999-02},
  address     = {Darmstadt, Germany},
  year        = {1999},
  month       = mar,
  url         = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-02.ps.gz},
  note        = {Available at \url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-1999-02.ps.gz}. A substantially revised version is available
                 upon request from the authors.}
}
%%% Series, number, publisher and address mirror the other entry for
%%% the 13th DISC proceedings in this file (Mostefaoui:1999:SCU).
@InProceedings{Pedone:1999:GB,
  author    = {F. Pedone and A. Schiper},
  title     = {Generic Broadcast},
  booktitle = {Proceedings of the 13th International Symposium
               on Distributed Computing (DISC'99)},
  series    = ser-LNCS,
  number    = {1693},
  publisher = pub-SV,
  address   = {Bratislava, Slovak Republic},
  year      = {1999},
  month     = sep,
  url       = {http://lsewww.epfl.ch/Documents/acrobat/PS99c.pdf},
  annote    = {see also \cite{Aguilera:2000:TGB} [to get]}
}
@Article{Reagle:1999:PPP,
  author          = {Joseph Reagle and Lorrie Faith Cranor},
  title           = {The platform for privacy preferences},
  journal         = {Communications of the ACM},
  volume          = {42},
  number          = {2},
  pages           = {48--55},
  year            = {1999},
  month           = feb,
  coden           = {CACMA2},
  ISSN            = {0001-0782},
  bibdate         = {Fri Feb 5 07:01:55 MST 1999},
  url             = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p48-reagle/},
  acknowledgement = ack-nhfb,
  annote          = {PPP is a way of formally stating privacy policies within
                     web pages and making privacy practices comparable and automatically
                     manageable. Related work is the TRUSTe seal \cite{Benassi:1999:T}.}
}
%%% The month value concatenates a literal day prefix with the aug macro.
@Article{Reicherzer:1999:AUA,
  author  = {Judith Reicherzer},
  title   = {{Angeklickt und abgezockt}},
  journal = {Die Zeit},
  number  = {34},
  pages   = {20--21},
  year    = {1999},
  month   = "19.~" # aug,
  annote  = {Gute Motivation fuer die Notwendigkeit von Fair Exchange.}
}
@Article{Reiter:1999:AWT,
  author          = {Michael K. Reiter and Aviel D. Rubin},
  title           = {Anonymous {Web} transactions with crowds},
  journal         = {Communications of the ACM},
  volume          = {42},
  number          = {2},
  pages           = {32--48},
  year            = {1999},
  month           = feb,
  coden           = {CACMA2},
  ISSN            = {0001-0782},
  bibdate         = {Fri Feb 5 07:01:55 MST 1999},
  url             = {http://www.acm.org:80/pubs/citations/journals/cacm/1999-42-2/p32-reiter/},
  acknowledgement = ack-nhfb,
  annote          = {One of the prominent projects to achieve anonymity on the
                     web. The approach of crowds uses a nondeterministic forwarding
                     service between clients within a crowd. A web server receiving a
                     request cannot know whether the request originated from the sender
                     or from some other member of the crowd. The concept can even provide
                     privacy against a number of collaborating members of the crowds
                     itself. Disadvantages of crowds are (among others) the increased
                     retrieval latency, and having to protect the confidentiality of the
                     message against other crowd members. Compared against the anonymizer
                     for example, crowds has no single point where privacy can be
                     compromised. Crowds has been implemented and deployed in the
                     US. Some practical issues are also discussed and references to
                     research papers are given. Other methods to achieve privacy are
                     onion routing \cite{Goldschlag:1999:OR}, anonymizer
                     (\url{www.anonymizer.com}), LPWA \cite{Gabber:1999:CYA}. Relevant
                     other articles are \cite{Reagle:1999:PPP,Benassi:1999:T}.}
}
%%% Fields not given here are taken from the crossref parent
%%% Vitek:1999:SIP, which classic BibTeX requires to appear later in
%%% the file than this entry.
@InCollection{Roth:1999:MPC,
  author    = {V. Roth},
  title     = {Mutual Protection of Co-operating Agents},
  booktitle = {Secure Internet Programming: Security Issues
               for Mobile and Distributed Objects},
  pages     = {277--287},
  crossref  = {Vitek:1999:SIP},
  annote    = {ref von Uwe Wilhelm}
}
%%% Both editors are organisations; each name is wrapped in its own
%%% brace group so BibTeX does not split it into first and last name.
@Misc{Semper:1999:ASA,
  editor       = {{SEMPER Consortium} and {IBM Z{\"u}rich}},
  title        = {Advanced Services, Architecture and Design},
  howpublished = {SEMPER Deliverable D10; La Gaude},
  year         = {1999},
  month        = mar,
  note         = {Available at \url{http://www.semper.org/deliver/d10/d10.ps.gz}},
  annote       = {Part of the final report on the SEMPER project.}
}
@TechReport{Sergent:1999:FDI,
  author      = {Nicole Sergent and Xavier D{\'e}fago and Andr{\'e} Schiper},
  title       = {Failure Detectors: implementation issues and impact on
                 consensus performance},
  institution = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne,
                 Switzerland},
  number      = {SSC/1999/019},
  year        = {1999},
  annote      = {This paper presents several different ways to implement
                 crash failure detectors and measures the impact of these
                 implementations on the performance of the Chandra Toueg Consensus
                 algorithm \cite{Chandra:1996:UFD}. The different implementations
                 are: heart beat (a node periodically sends `alive' messages),
                 interrogation (nodes keep exchanging `are you alive', `alive'
                 messages), and two optimizations: use only critical messages to do
                 request response type failure detection, sending heart beats only
                 between critical requests/responses. The simulation of the
                 consensus algorithm shows that the time out used to implement
                 suspicions together with the period interval of sending failure
                 detector messages have optimal combinations regarding the
                 termination time of the algorithm. It is argued that using
                 failure detectors does not relieve the engineer to consider
                 timing issues (also indicated by \cite{Fetzer:1999:CTA}).}
}
@InProceedings{Theel:1999:EPC,
  author    = {Oliver Theel and Felix C. G{\"a}rtner},
  title     = {An Exercise in Proving Convergence through Transfer
               Functions},
  booktitle = pro-wss99,
  editor    = {Anish Arora},
  pages     = {41--47},
  publisher = pub-IEEE,
  address   = {Austin, TX},
  year      = {1999},
  month     = jun,
  annote    = {A simpler example than in \cite{Theel:1998:OPS}, still
               not distributed, but from an algorithms viewpoint.}
}
@InProceedings{Theel:1999:OPT,
  author    = {Oliver Theel and Felix C. G{\"a}rtner},
  title     = {On proving termination through transfer functions},
  booktitle = {Proceedings of the 4th International Workshop on
               Termination},
  address   = {Dagstuhl, Germany},
  year      = {1999},
  month     = may
}
@InCollection{Verissimo:1999:TDS,
  author    = {Paulo Ver{\'\i}ssimo and Michel Raynal},
  title     = {Time in distributed system models and algorithms},
  booktitle = {Advances in Distributed Systems, Part I -- Distributed
               Algorithms},
  publisher = {ESPRIT Broadcast, Springer-Verlag},
  year      = {1999},
  note      = {to appear, available at \url{http://www.navigators.di.fc.ul.pt/archive/TimeBcast.ps.gz}},
  annote    = {Summarizes a great deal of work already published
               elsewhere. First briefly sketches the controversy synchrony
               vs. asynchrony and states that today, timeliness constraints in
               terms of real-time are increasingly important, especially in
               dependable systems (flight control) or QoS applications. This leads
               to the quasi synchronous system model, which is then briefly
               elaborated on (for a more detailed explanation, see
               \cite{Almeida:1998:QSA}). Timing failure detectors (as
               generalizations of crash failure detectors \cite{Chandra:1996:UFD})
               are presented, motivated and implemented in the quasi synchronous
               setting. Timing failure detectors are complete in a safety sense
               (i.e., they detect timing failures within a known real-time
               bound). Such failure detectors can be generalized to QoS failure
               detectors. Then the CesiumSpray system for global clock
               synchronization is presented (a hierarchical and hybrid one to
               exploit the characteristics of different WAN/LAN settings), then
               follow some generalizations of causal or temporal precedence orders
               which also take events outside of the system into account and try to
               order them (I did not read that too carefully). Finally, some
               protocols to achieve such order are presented.}
}
%%% Crossref parent of Roth:1999:MPC; classic BibTeX needs it to stay
%%% below that entry in the file.
@Book{Vitek:1999:SIP,
  editor    = {J. Vitek and C. Jensen},
  title     = {Secure Internet Programming: Security Issues
               for Mobile and Distributed Objects},
  series    = {Lecture Notes in Computer Science},
  volume    = {1603},
  publisher = pub-SV,
  address   = {New York, NY, USA},
  year      = {1999},
  keywords  = {Computer security; Electronic data processing ---
               Distributed processing --- Security; Intelligent agents
               (Computer software) --- Security measures;
               Mobile agents (Computer software)}
}
@InProceedings{Vogt:1999:FAE,
  author    = {Holger Vogt and Henning Pagnia},
  title     = {{Fairer Austausch beim elektronischen Einkauf im Internet}},
  booktitle = {Proceedings of the 6th DFN-CERT Workshop ``Sicherheit
               in vernetzten Systemen''},
  address   = {Hamburg, Germany},
  year      = {1999},
  month     = mar,
  note      = {in German},
  annote    = {Ueberblick ueber Protokolle zum fairen Austausch von
               Ware gegen Geld in unsicheren Netzen wie dem Internet. Diskussion
               der Begriffe starker und schwacher Fairness von Asokan und
               der spaerlichen Literatur zu diesem Thema. Vorstellung einiger
               Protokolle zur starken Fairness mit Vermittlern: (1) mit aktivem
               Vermittler, (2) optimistisch mit generierbaren Waren, (3) optimistisch
               mit Zahlungswiderrufsmoeglichkeit, (4) optimistisch mit generierbarer
               Ware und Widerrufbarkeit. Am Ende Diskussion von Anonymitaet, die
               wenig Auswirkungen auf die vorgestellten Protokolle hat.}
}
@InCollection{Wilhelm:1999:ITT,
  author    = {U. G. Wilhelm and S. Staamann and L. Butty{\'a}n},
  title     = {Introducing Trusted Third Parties to the Mobile Agent Paradigm},
  booktitle = {Secure Internet Programming: Security Issues for Mobile and Distributed Objects},
  editor    = {J. Vitek and C. Jensen},
  series    = {Lecture Notes in Computer Science},
  volume    = {1603},
  pages     = {471--491},
  publisher = pub-SV,
  address   = {New York, NY, USA},
  year      = {1999},
  keywords  = {Security},
  annote    = {[got it?]}
}
%%% Standard styles do not print a number field for theses, so the
%%% thesis number is also given in the note field.
@PhdThesis{Wilhelm:1999:TAP,
  author  = {U. G. Wilhelm},
  title   = {A Technical Approach to Privacy based on
             Mobile Agents protected by Tamper-resistant Hardware},
  school  = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  address = {Lausanne, Switzerland},
  year    = 1999,
  month   = may,
  number  = {1961},
  note    = {Thesis number 1961}
}
@Article{Aguilera:2000:FDC,
  author   = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title    = {Failure Detection and Consensus in the Crash Recovery
              Model},
  journal  = {Distributed Computing},
  volume   = {13},
  number   = {2},
  pages    = {99--125},
  year     = {2000},
  month    = apr,
  url      = {http://link.springer.de/link/service/journals/00446/papers/0013002/00130099.pdf},
  alt-url  = {http://www.cs.cornell.edu/home/sam/FDpapers/crash-recovery-finaldcversion.ps},
  abstract = {We study the problems of failure detection and
              consensus in asynchronous systems in which processes
              may crash and recover, and links may lose messages. We
              first propose new failure detectors that are
              particularly suitable to the crash-recovery model. We
              next determine under what conditions stable storage is
              necessary to solve consensus in this model. Using the
              new failure detectors, we give two consensus algorithms
              that match these conditions: one requires stable
              storage and the other does not. Both algorithms
              tolerate link failures and are particularly efficient
              in the runs that are most likely in practice - those
              with no failures or failure detector mistakes. In such
              runs, consensus is achieved within $3 \delta$ time and
              with 4 n messages, where $\delta$ is the maximum
              message delay and n is the number of processes in the
              system.},
  annote   = {Description in \cite{Aguilera:1998:FDCTR}.}
}
@Article{Aguilera:2000:QRC,
  author  = {Marcos Kawazoe Aguilera and Wei Chen and Sam Toueg},
  title   = {On quiescent reliable communication},
  journal = {SIAM Journal on Computing},
  volume  = {29},
  number  = {6},
  pages   = {2040--2073},
  year    = {2000},
  month   = dec,
  url     = {http://www.cs.cornell.edu/Info/People/sam/FDpapers/ACTquiescent-SIAM.ps},
  annote  = {Quiescent algorithms are those that eventually stop
             sending messages. Quiescent reliable communication protocols are
             algorithms like reliable broadcast or uniform reliable broadcast
             that are quiescent. The authors study quiescent reliable communication
             algorithms in systems where processes may crash and links are fair.
             A link is fair if it does not introduce spurious messages and if
             a message which is sent infinitely often is received infinitely often.
             In such systems it is impossible to implement quiescent reliable
             communication without failure detectors. Why? Reliable communication
             means that whenever nodes $s$ and $r$ are correct and $s$ sends a
             message to $r$, then $r$ must eventually receive the message. However,
             $s$ must achieve this by sending only finitely many messages. Any
             number of messages may be lost due to the fair channels, and so
             $s$ can never be sure whether $r$ has crashed or is alive if it does
             not receive an acknowledgement. Failure detection can help in
             this case. However, the usual failure detectors which output
             lists of suspects are not very useful. Any such bounded failure
             detector that helps solve quiescent reliable communication is
             at least as powerful as the eventually perfect failure detector.
             Why? The bound on the output of the failure detector implies that
             eventually it will keep on repeating the same (limit) values again
             and again. The existence of a quiescent communication primitive
             however implies that the limit value is in fact the set of correct
             processes. Thus, using this failure detector it is possible to
             emulate an eventually perfect failure detector. Next, the authors
             introduce a new type of failure detector called Heartbeat which
             has an unbounded output range. The range is a vector of elements
             (one for each process, or neighboring process) that keeps on
             increasing without bound as long as that process is alive. Thus,
             the failure detector can now be used to keep the system going.
             To achieve quiescence it is now possible to take a change in
             the heartbeat failure detector as the cause of a retransmission
             unless an acknowledgement has been received. In a sense, the
             decision whether to stop or not is transferred into the failure
             detector. Obviously, heartbeat is implementable in asynchronous
             systems (the authors give an implementation), and naturally,
             such an implementation cannot be quiescent. In systems where
             heartbeat is available quiescent reliable communication can be
             achieved and so fair links lose their danger: many algorithms that
             rely on reliable links can now be transformed into environments
             with lossy links (fair ones, not fair lossy \cite{Basu:1996:SRL}),
             whenever Heartbeat is available. It must however be checked
             whether reliable can be substituted with ``quasi-reliable''
             communication (quasi-reliable is equal to reliable if processes
             do not crash during quasi-reliable sending). The concluding
             remarks touch some other interesting points: (1)
             message buffering can be limited by at some point excluding
             suspected processes from the active group (i.e. explicitly
             crashing them). The heartbeat implementation will however
             ensure that no messages are sent to them long before they are
             excluded. (2) a terminating protocol is quiescent, but a
             quiescent protocol need not terminate. A layering technique
             is proposed that has failure detection as a basic mechanism
             (non-quiescent, non-terminating), building upon failure detection
             is reliable communication (quiescent, non-terminating), and
             on top can be terminating applications like consensus. (3)
             fair lossy \cite{Basu:1996:SRL} is opposed to fair channels,
             stating that the results also hold for fair lossy links, only
             that expensive piggybacking is required in this case. (4) failure
             detectors with finite output range have limitations (this is
             obvious from the fact that quiescent reliable communication
             needs an eventually perfect failure detector if the output range
             is bounded and such a detector is impossible to implement in
             asynchronous systems). However, when comparing failure detectors
             it is necessary to see whether the transformation is quiescent
             too.}
}
@InProceedings{Aguilera:2000:TGB,
  author    = {Marcos Kawazoe Aguilera and Carole Delporte-Gallet and
               Hugues Fauconnier and Sam Toueg},
  title     = {Thrifty generic broadcast},
  booktitle = {Proceedings of the 14th International Symposium on
               Distributed Computing (DISC)},
  series    = ser-LNCS,
  number    = {1914},
  pages     = {268--282},
  publisher = pub-SV,
  address   = {Toledo, Spain},
  year      = {2000},
  month     = oct,
  annote    = {Looks at atomic broadcasts where the total order may
               be relaxed. Implementations of such operators can of course rely
               on atomic broadcast, but this is unsatisfactory. The strictness
               property proposed by Pedone and Schiper (generic broadcast) is
               not sufficient. In this paper, new definitions for a broadcast
               to be a good implementation of generic broadcast are proposed.
               The definition is based on the notion of using an oracle like
               a failure detector. A generic broadcast implementation is good
               (=thrifty) if the implementation uses the oracle only when
               conflicting messages need to be processed (a more formal definition
               is: if there is a time after which only non-conflicting messages
               are broadcast, then there is a time after which the oracle is
               not used anymore). The oracle used is in fact atomic broadcast.}
}
@InProceedings{Arora:2000:RVC,
  author    = {Anish Arora and Sandeep Kulkarni and Murat Demirbas},
  title     = {Resettable vector clocks},
  booktitle = {Proceedings of the Nineteenth Annual ACM
               Symposium on Principles of Distributed Computing (PoDC)},
  pages     = {269--278},
  year      = {2000},
  annote    = {Resettable vector clocks are vector clocks that use bounded
               state space. This paper identifies assumptions under which vector
               clocks may be replaced by resettable vector clocks in an application
               without endangering its correctness. Then resettable vector clocks
               are made stabilizing fault tolerant using detectors and correctors
               (a global reset is fired on local detection).}
}
@InCollection{Arora:2000:S,
  author    = {Anish Arora},
  title     = {Stabilization},
  booktitle = {Encyclopedia of Distributed Computing},
  editor    = {Partha Dasgupta and Joseph E. Urban},
  publisher = {Kluwer},
  year      = {2000},
  url       = {ftp://ftp.cis.ohio-state.edu/pub/anish/papers/stb.ps.gz}
}
@Article{Bernhardt:2000:RDR,
  author  = {Ute Bernhardt},
  title   = {{Reiten auf der Risikowelle (Editorial zum Sonderheft
             zum Thema ``Verletzlichkeit der
             Informationsgesellschaft'')}},
  journal = {FIfF Kommunikation},
  number  = {3},
  pages   = {3},
  year    = {2000},
  month   = sep,
  annote  = {Editorial zum Sonderheft. Im Sonderheft selbst sind
             ausnahmslos lesenswerte Artikel beispielsweise ueber
             kritische Infrastrukturen \cite{Schulzki:2000:KI},
             Cybercrime, Jugendschutz im
             Internet und Vertrauen. Interessant ist, dass etwa
             zur selben Zeit eine thematisch aehnliche Ausgabe von
             IEEE Computer erscheint \cite{Jones:2000:CBS}.}
}
@InProceedings{Boichat:2000:RBC,
  author    = {Romain Boichat and Rachid Guerraoui},
  title     = {Reliable Broadcast in the Crash-Recovery Model},
  booktitle = pro-srds2000,
  publisher = pub-IEEE,
  address   = {N\"urnberg, Germany},
  year      = {2000},
  month     = oct
}
@Article{Bowen:2000:ESC,
  author  = {Jonathan Bowen},
  title   = {The ethics of safety-critical systems},
  journal = j-CACM,
  volume  = {43},
  number  = {4},
  pages   = {91--97},
  year    = {2000},
  month   = apr,
  annote  = {Presents sins and truths of safety critical systems
             engineering. Explicitly discusses formal methods.}
}
@TechReport{Brasileiro:2000:COC,
  author      = {Francisco Brasileiro and Fab{\'\i}ola Greve and Achour
                 Most{\'e}faoui and Michel Raynal},
  title       = {Consensus in one communication step},
  institution = {IRISA},
  number      = {PI-1321},
  address     = {Rennes, France},
  year        = {2000},
  annote      = {[to read]}
}
@InProceedings{Breitling:2000:MFD,
  author    = {Max Breitling},
  title     = {Modeling faults of distributed, reactive systems},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant
               Systems, 6th International Symposium (FTRTFT 2000)
               Proceedings},
  editor    = {Mathai Joseph},
  series    = ser-LNCS,
  number    = {1926},
  pages     = {58--69},
  publisher = pub-SV,
  address   = {Pune, India},
  year      = {2000},
  month     = sep,
  annote    = {Models faults as addition of variables and transitions in
               a special formalism (Fokus) which supports compositionality
               and refinement.}
}
@Article{Buschek:2000:M4W,
  author  = {Oliver Buschek},
  title   = {{Mit dem 486er zur Raumstation}},
  journal = {Chip},
  pages   = {92--98},
  year    = {2000},
  month   = feb,
  annote  = {Leicht verstaendlicher Einstieg in Themen der Fehlertoleranz
             im Weltraum. Fokus auf ISS: Dort sind 6fach redundante
             Schuhschachteln, die Byzantinisches Agreement machen, drin.
             Gibt auch Hinweise auf Webadressen der Nasa und ESA.}
}
@Misc{Cachin:2000:RMU,
  author = {C. Cachin and J. Camenisch and M. Dacier and Y. Deswarte
            and J. Dobson and D. Horne and K. Kursawe and J.-C. Laprie
            and J.-C. Lebraud and D. Long and T. McCutcheon and
            J. {M\"uller} and F. Petzold and B. Pfitzmann and D. Powell
            and B. Randell and M. Schunter and V. Shoup and
            P. Ver{\'\i}ssimo and G. Trouessin and R. J. Stroud and
            M. Waidner and I. S. Welch},
  title  = {Reference Model and Use Cases},
  year   = 2000,
  month  = aug,
  note   = {Deliverable D1 of the MAFTIA project \cite{MAFTIA}.}
}
@InProceedings{Cachin:2000:ROC,
  author    = {Christian Cachin and Klaus Kursawe and Victor Shoup},
  title     = {Random oracles in {Constantinople}: practical asynchronous
               {Byzantine} agreement using cryptography},
  booktitle = {Proceedings of the Symposium on Principles of Distributed
               Computing},
  pages     = {123--132},
  address   = {Portland, Oregon},
  year      = {2000},
  annote    = {Looks at randomized Byzantine agreement and presents an
               optimistic solution using a randomized and cryptographically secure
               coin toss. This is a good example on what and where fault-tolerance
               can learn from cryptography.}
}
@InProceedings{Charron-Bost:2000:RSL,
  author    = {Bernadette Charron-Bost and Sam Toueg and Anindya Basu},
  title     = {Revisiting safety and liveness in the context of failures},
  booktitle = {Proceedings of CONCUR2000 -- Concurrency Theory,
               11th Int. Conference},
  editor    = {C. Palamidessi},
  series    = ser-LNCS,
  number    = {1877},
  pages     = {552--565},
  publisher = pub-SV,
  address   = {University Park, PA},
  year      = {2000},
  month     = aug,
  annote    = {Agreement in consensus is defined as ``no two correct
               processes decide differently''. Against common belief, this is
               a liveness property in systems where processes may crash. This
               is because if two processes have decided differently, then
               agreement can still be achieved if one of them crashes. The
               authors define pure safety and pure liveness meaning that
               safety and liveness hold without ``the help or non-help of
               failures''. Pure liveness means that something good can still
               happen without the help of failures. Pure safety means that
               executions which do not satisfy the property must contain
               failures to satisfy the property. Pure versions are strictly
               weaker than the original versions. The authors define a
               property transformer `Pure' that `makes a property pure' by
               removing all executions which contain undesirable partial
               runs. Pure agreement demands that no two alive processes
               decide differently and comes closer to our intuition of
               agreement in consensus. Pure agreement is stronger than
               uniform agreement but weaker than agreement. Shows that every
               pure property is the intersection of a pure safety and a pure
               liveness property. A startling paper which demands more
               investigation!}
}
@InProceedings{Charron-Bost:2000:SSP,
  author    = {Bernadette Charron-Bost and Rachid Guerraoui and
               {Andr\'e} Schiper},
  title     = {Synchronous System and Perfect Failure Detector:
               Solvability and Efficiency Issues},
  booktitle = {International Conference on Dependable Systems and
               Networks (IEEE Computer Society)},
  year      = {2000},
  annote    = {Looks at the relation between the synchronous system model
               and the asynchronous model augmented with perfect failure detectors.
               They show that there are problems which are solvable in synchronous
               systems but are unsolvable in asynchronous systems with perfect failure
               detectors. Hence, both models are not equivalent in this respect.
               This is because failure detectors give no information on the
               causal relation between the crash event and other events on the
               crashed process. This means that you cannot decide whether there
               is still a message in transit coming from the crashed process
               or not. If you want to base a decision on this fact you have
               the same dilemma as in FLP \cite{Fischer:1985:IDC}. But if it
               comes to consensus, both models are ok because you can solve
               this problem in both. However, in synchronous systems algorithms
               can be constructed with a lower latency degree \cite{Schiper:1997:ECA}
               so more efficient solutions are possible in the synchronous model.}
}
@InProceedings{Chen:2000:QOS,
  author    = {Wei Chen and Sam Toueg and Marcos Kawazoe Aguilera},
  title     = {On the quality of service of failure detectors},
  booktitle = {Proceedings of the International Conference on
               Dependable Systems and Networks (DSN 2000)},
  publisher = pub-IEEE,
  address   = {New York},
  year      = {2000},
  month     = jun,
  annote    = {In a system model where message delays and losses follow
               some probability distribution the authors study performance metrics
               regarding the accuracy and completeness of the failure detectors
               which were introduced in the time free model \cite{Chandra:1996:UFD}.
               Metrics concerning completeness are the detection time (i.e. the
               time between the crash and the detection of the crash). Metrics
               concerning accuracy are mistake recurrence time (the time between
               two successive mistakes) and mistake duration (the time it takes
               to correct a mistake). Other accuracy metrics can be derived from
               them (average mistake rate, query accuracy probability, good period
               duration, forward good period duration). An algorithm is presented
               which achieves optimality concerning some metrics and is based
               on synchronized clocks: a timeout is started not when a heartbeat
               arrives but at certain freshness points which are at equal intervals
               at both processes (with a message delay difference). Discusses
               how to tune the parameters of the algorithm to perform nearly optimal
               and presents some ideas concerning adaptivity. Gives an overview
               over other failure detection approaches in the literature.}
}
@Article{Crawford:2000:BNP,
  author  = {Gregory P. Crawford},
  title   = {A bright new page in portable displays},
  journal = {IEEE Spectrum},
  volume  = {37},
  number  = {10},
  pages   = {40--46},
  year    = {2000},
  month   = oct,
  annote  = {Gives insight in new display technology aka smart paper.
             Presents some fascinating photos of a cholesteric LCD
             display of Kent Displays Inc., Kent, Ohio, which is reflective
             (needs no back light) and does not need power to hold
             the image. Also describes the technologies behind this
             display and Gyricon (Xerox) and E ink.}
}
@PhdThesis{Defago:2000:ARP,
  author  = {Xavier D{\'e}fago},
  title   = {Agreement-related problems: from semi-passive replication
             to totally ordered broadcast},
  school  = {{\'E}cole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  address = {Lausanne, Switzerland},
  year    = {2000},
  note    = {Thesis number 2229}
}
@Article{Ditlea:2000:PCG,
  author  = {Steve Ditlea},
  title   = {The {PC} goes ready-to-wear},
  journal = {IEEE Spectrum},
  volume  = {37},
  number  = {10},
  pages   = {34--39},
  year    = {2000},
  month   = oct,
  annote  = {This is more a market survey of wearables, presenting
             display technology, prototypes (Xybernaut, IBM etc)
             and e.g. Twiddler chorded keyboard. For a visionary
             article see \cite{Billinghurst:1999:WDN}.}
}
@Book{Dolev:2000:SS,
  author    = {Shlomi Dolev},
  title     = {Self-Stabilization},
  publisher = {MIT Press},
  year      = {2000}
}
@InProceedings{Echtle:2000:FFM,
  author    = {Klaus Echtle and Asif Masum},
  title     = {A fundamental failure model for fault-tolerant protocols},
  booktitle = {Proceedings of the IEEE International Computer
               Performance and Dependability Symposium (IPDS2K)},
  pages     = {69--78},
  publisher = pub-IEEE,
  address   = {Chicago, IL},
  year      = {2000},
  annote    = {See also \cite{Echtle:1999:UCB,Masum:2000:NCB}. A
               more elaborate description is attached to the entry of
               \cite{Echtle:1999:UCB}.}
}
@InProceedings{Gaertner:2000:CDG,
  author    = {Felix C. G{\"a}rtner and Sven Kloppenburg},
  title     = {Consistent Detection of Global Predicates Under a
               Weak Fault Assumption},
  booktitle = pro-srds2000,
  pages     = {94--103},
  publisher = pub-IEEE,
  address   = {N{\"u}rnberg, Germany},
  year      = {2000},
  month     = oct
}
@TechReport{Gaertner:2000:RIS,
  author      = {Felix C. G{\"a}rtner and Hagen V{\"o}lzer},
  title       = {Redundancy in space in fault-tolerant systems},
  institution = {Department of Computer Science,
                 Darmstadt University of Technology},
  number      = {TUD-BS-2000-06},
  address     = {Darmstadt, Germany},
  year        = {2000},
  month       = jul,
  url         = {http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2000-06.ps.gz}
}
@InProceedings{Hiller:2000:EAD,
  author    = {Martin Hiller},
  title     = {Executable assertions for detecting data errors in
               embedded control systems},
  booktitle = {Proceedings of the International Conference on Dependable
               Systems and Networks (DSN 2000)},
  pages     = {24--33},
  year      = {2000}
}
@InProceedings{Huang:2000:TFP,
  author    = {Shing-Tsaan Huang},
  title     = {The fuzzy philosophers},
  booktitle = {Proceedings of the 15th IPDPS 2000 Workshops},
  editor    = {J. Rolim and others},
  volume    = {1800},
  series    = ser-LNCS,
  pages     = {130--136},
  address   = {Cancun, Mexico},
  month     = may,
  year      = {2000},
  publisher = pub-SV,
  annote    = {Generalization of the dining philosophers and a
               self-stabilizing solution.}
}
@Article{Hutter:2000:AII,
  author  = {Reinhard Hutter},
  title   = {{Angriffe auf Informationstechnik und Infrastrukturen --
             Realit\"at oder Science Fiction?}},
  journal = {Aus Politik und Zeitgeschichte},
  volume  = {41--42},
  pages   = {31--38},
  year    = {2000},
  annote  = {Gute Einfuehrung in und Referenz zu kritischen
             Infrastrukturen, eher aus allgemeinverstaendlicher und
             politikwissenschaftlicher Sicht.}
}
@Article{Jones:2000:CBS,
  author  = {Anita Jones},
  title   = {The challenge of building survivable information-intensive
             systems (introduction to special issue on ``critical
             infrastructures'')},
  journal = {IEEE Computer},
  volume  = {33},
  number  = {8},
  pages   = {39--43},
  month   = aug,
  year    = {2000},
  annote  = {A German Journal with similar directions appeared at about
             the same time \cite{Bernhardt:2000:RDR}.}
}
@InProceedings{Karjoth:2000:SMA,
  author    = {G{\"u}nter Karjoth},
  title     = {Secure Mobile Agent-Based Merchant Brokering in
               Distributed Marketplaces},
  booktitle = asama2000,
  volume    = {1882},
  series    = ser-LNCS,
  pages     = {44--56},
  address   = {Zurich, Switzerland},
  month     = sep,
  year      = 2000,
  publisher = pub-SV,
  keyword   = {agents, e-commerce, security, mobile agent},
  abstract  = {Cooperating merchants establish a distributed
               marketplace under the auspices of an independent
               market authority. Each merchant's server is equipped
               with a trusted device, a smart card for example,
               provided by the market authority. The market
               authority plays the role of a trusted third party
               for the customer as well as for the merchants. This
               paper describes protocols that prevent the malicious
               alteration of the data collected by visiting mobile
               agents roaming through the marketplace without being
               detectable by subsequent servers or by the owner of
               the agent upon its return. Another protocol makes
               the trusted device a secure execution platform for
               routines provided by the agent owner.}
}
@Article{Kehr:2000:SV,
  author  = {Roger Kehr},
  title   = {Spontane {Vernetzung}},
  journal = {Informatik Spektrum},
  volume  = {23},
  number  = {3},
  pages   = {161--172},
  month   = jun,
  year    = {2000},
  annote  = {Good survey of the three main methods for spontaneous
             networking (Jini, SLP, UPnP). Also discusses Bluetooth.
             Good starting point for German readers.}
}
@TechReport{Kloppenburg:2000:CDG,
  author      = {Sven Kloppenburg and Felix C. G{\"a}rtner},
  title       = {Consistent detection of global predicates in
                 asynchronous systems with crash failures},
  institution = {Darmstadt University of Technology, Department of
                 Computer Science},
  number      = {TUD-BS-2000-01},
  address     = {Darmstadt, Germany},
  month       = feb,
  year        = {2000},
  url         = {\url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2000-01.abstract.html}}
}
@InProceedings{Kulkarni:2000:AAF,
  author    = {Sandeep S. Kulkarni and Anish Arora},
  title     = {Automating the addition of fault-tolerance},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant
               Systems, 6th International Symposium (FTRTFT 2000)
               Proceedings},
  editor    = {Mathai Joseph},
  volume    = {1926},
  series    = ser-LNCS,
  pages     = {82--93},
  address   = {Pune, India},
  month     = sep,
  year      = {2000},
  publisher = pub-SV,
  annote    = {Looks at automatically adding detectors and correctors
               in the sense of \cite{Arora:1998:DCT} to existing programs.
               Specifications are fusion- and suffix-closed, giving
               ``bad'' transitions which violate safety. Idea of fail-safe
               fault-tolerance is to cut away all paths leading to
               these bad transitions. This must be possible without
               changing the original behavior. Non-masking fault-tolerance
               is achieved by adding transitions from all states outside
               of the invariant to states within. Masking fault-tolerance
               is somewhat more complex. States that adding fault-tolerance
               is NP-complete but refers the proof to a TR. From a
               conceptual point of view this is similar to
               \cite{Gaertner:2000:RIS}. An important point in the
               transformation is that the fault-tolerant version must
               not contain ``new'' ways to satisfy the specification.}
}
@Article{Kumagai:2000:LEV,
  author  = {Jean Kumagai},
  title   = {faults \& failures: {London} stock exchange vanishes
             for 8 hours},
  journal = {IEEE Spectrum},
  volume  = {37},
  number  = {6},
  pages   = {30--31},
  month   = jun,
  year    = {2000},
  annote  = {Sketches the 8 hour blackout of the London stock
             exchange (LSE) on April 5, 2000. Slow overnight batch jobs
             had caused old prices to get mixed up with new prices.
             Frantic calls from traders persuaded the LSE to delay
             trading until the problem was fixed (trading is useless
             with wrong prices). This delay lasted 8 hours. The
             reason for the slow batch job was an inherent program
             inefficiency combined with an unusually high volume
             of data. Fixing required rewriting a couple of lines
             of code --- ``absolutely trivial''. Costs are
             estimated in the millions of pounds.}
}
@Article{Lamport:2000:FAH,
  author   = {Leslie Lamport},
  title    = {Fairness and hyperfairness},
  journal  = {Distributed Computing},
  volume   = {13},
  number   = {4},
  pages    = {239--245},
  year     = {2000},
  url      = {http://link.springer.de/link/service/journals/00446/papers/0013004/00130239.pdf},
  abstract = {The notion of fairness in trace-based formalisms is
              examined. It is argued that, in general, fairness means
              machine closure. The notion of hyperfairness introduced
              by Attie, Francez, and Grumberg is generalized to
              arbitrary action systems. Also examined are the
              fairness criteria proposed by Apt, Francez, and Katz.},
  annote   = {There's a good quote here about reasoning about liveness
              properties: ``Fairness conditions are a way of expressing
              liveness properties, and liveness properties are inherently
              problematic. The question of whether a real system satisfies a
              liveness property is meaningless; it can be answered only by
              observing the system for an infinite length of time, and real
              systems don't run forever. Liveness is always an approximation to
              the property we really care about. We want a program to terminate
              within 100 years, but proving that it does would require addition
              of distracting timing assumptions. So, we prove the weaker
              condition that the program eventually terminates. This doesn't
              prove that the program will terminate within our lifetimes, but it
              does demonstrate the absence of infinite loops.'' This is a
              must-read paper for people interested in liveness issues.}
}
@InProceedings{Lano:2000:IBS,
  author    = {K. Lano and David Clark and K. Androutsopoulos and P. Kan},
  title     = {Invariant-based synthesis of fault-tolerant systems},
  booktitle = {Formal Techniques in Real-Time and Fault-Tolerant
               Systems, 6th International Symposium (FTRTFT 2000)
               Proceedings},
  editor    = {Mathai Joseph},
  volume    = {1926},
  series    = ser-LNCS,
  pages     = {46--57},
  address   = {Pune, India},
  month     = sep,
  year      = {2000},
  publisher = pub-SV,
  annote    = {Uses a precise formal semantics of statecharts to
               compositionally develop and verify systems. Presents a
               fault-tolerant production cell as a case study.}
}
@TechReport{Larrea:2000:ECF,
  author      = {Mikel Larrea and Antonio Fern{\'a}ndez and
                 Sergio Ar{\'e}valo},
  title       = {Eventually consistent failure detectors},
  institution = {Universidad {P\'ublica} de Navarra, Spain},
  month       = apr,
  year        = {2000},
  note        = {Presented as a brief announcement at DISC2000},
  url         = {\url{http://www.gsd.unavarra.es/pres/miembros/mikel/consistent.ps}},
  annote      = {A new class of failure detectors is presented called
                 `eventually consistent'. The weak accuracy property is enriched by
                 a function with which processes can identify the `common one'
                 process which is not wrongly suspected. This can be seen as a type
                 of leader election capability. Eventually consistent failure detectors
                 lie between eventually perfect and eventually strong ones. The additional
                 information offered by this failure detector allows more efficient
                 consensus algorithms. Since everybody eventually focusses on one
                 and the same process as a coordinator, consensus algorithms are possible
                 which do not rely on the rotating coordinator paradigm. This is part
                 of Mikel's PhD research (see also \cite{Larrea:2000:OIW}).}
}
@InProceedings{Larrea:2000:OIW,
  author    = {Mikel Larrea and Antonio Fern{\'a}ndez and
               Sergio Ar{\'e}valo},
  title     = {Optimal Implementation of the Weakest Failure Detector
               for Solving Consensus},
  booktitle = pro-srds2000,
  address   = {N\"urnberg, Germany},
  month     = oct,
  year      = {2000},
  publisher = pub-IEEE
}
@InProceedings{Mantel:2000:CSM,
  author    = {Heiko Mantel and Felix C. G{\"a}rtner},
  title     = {A case study in the
               mechanical verification of fault tolerance},
  booktitle = {Proceedings of the 13th
               International Florida Artificial Intelligence
               Conference (FLAIRS-2000)},
  address   = {Orlando, FL},
  month     = may,
  year      = {2000},
  annote    = {Preliminary version available as TR \cite{Mantel:1999:CSM}.}
}
@Article{Mantel:2000:ACS,
  author  = {Heiko Mantel and Felix C. G{\"a}rtner},
  title   = {A case study in the
             mechanical verification of fault tolerance},
  journal = {Journal of Experimental \& Theoretical Artificial
             Intelligence (JETAI)},
  volume  = {12},
  number  = {4},
  pages   = {473--488},
  month   = oct,
  year    = {2000}
}
@PhdThesis{Masum:2000:NCB,
  author = {Asif Masum},
  title  = {Non-cooperative {Byzantine} failures: {A} new framework
            for the design of efficient fault tolerance protocols},
  school = {Universit\"at-Gesamthochschule Essen, Fachbereich Mathematik
            und Informatik},
  year   = {2000},
  note   = {Published by Libri Books on demand, ISBN 3-8311-0815-3.},
  annote = {Conference version e.g. \cite{Echtle:1999:UCB}. Good
            overview over failure classification schemes.}
}
@article{Matsui:2000:FTS,
  author    = {H. Matsui and M. Inoue and T. Masuzawa and H. Fujiwara},
  title     = {Fault-Tolerant and Self-Stabilizing Protocols Using an
               Unreliable Failure Detector},
  journal   = {IEICE Transactions},
  volume    = {E83-D},
  number    = {10},
  pages     = {1831--1840},
  month     = oct,
  year      = {2000},
  publisher = {Institute of Electronics, Information and
               Communication Engineers},
  keywords  = {distributed algorithms; self-stabilization;
               fault-tolerance; failure detector; x-group consensus},
  abstract  = {We investigate possibility of fault-tolerant and
               self-stabilizing protocols (ftss protocols) using an unreliable
               failure detector. Our main contribution is (1) to newly introduce
               k-accuracy of an unreliable failure detector, (2) to show that
               k-accuracy of a failure detector is necessary for any ftss k-group
               consensus protocol, and (3) to present three ftss k-group
               consensus protocols using a k-accurate and weakly complete failure
               detector under the read/write daemon on complete networks and on
               (n-k+1)-connected networks, and under the central daemon on
               complete networks.},
  annote    = {The term $k$-accuracy means that at least $k$ correct
               processes will not be wrongly suspected by the failure
               detector. $k=1$ is the same as weak accuracy while $k=n-t$ is the
               same as strong accuracy. (See also the eventual consistency
               definition of \cite{Larrea:2000:ECF}.) In a $k$-group consensus
               protocol all correct processes must eventually choose the same
               group of $k$ processes. This looks something like self-stabilizing
               $k$ leader election.}
}
@InProceedings{Mittal:2000:DDP,
  author    = {Neeraj Mittal and Vijay K. Garg},
  title     = {Debugging Distributed Programs Using Controlled
               Re-execution},
  booktitle = {Proceedings of the 19th Annual {ACM} Symposium on
               Principles of Distributed Computing ({PODC}-00)},
  pages     = {239--248},
  address   = {NY},
  month     = jul # " ~16--19",
  year      = {2000},
  publisher = {ACM Press},
  annote    = {Controlled re-execution means to execute a distributed program
               so that a given safety property is maintained during that execution.
               The authors identify a class of predicates for which this can be
               done efficiently, i.e. without much synchronization. There are some
               resemblances here to Schneider's enforceable security policies
               \cite{Schneider:2000:ESP}.}
}
@InProceedings{Mostefaoui:2000:KSA,
  author    = {Achour Most{\'e}faoui and Michel Raynal},
  title     = {{\it{k}}-Set Agreement with Limited Accuracy Failure
               Detectors},
  booktitle = {Proceedings of the 19th Annual {ACM} Symposium on
               Principles of Distributed Computing ({PODC}-00)},
  pages     = {143--152},
  address   = {NY},
  month     = jul # " ~16--19",
  year      = {2000},
  publisher = {ACM Press},
  annote    = {Looks at the $k$-set agreement of \cite{Chaudhuri:1990:AHC}
               and shows the possibility and impossibility of solving it under
               different assumptions which include a failure detector with limited
               scope. Informally, the scope of the accuracy property is the number
               of processes that may not suspect a correct process.}
}
@InProceedings{Namjoshi:2000:CCR,
  author    = {Kedar S. Namjoshi and Richard J. Trefler},
  title     = {On the completeness of compositional reasoning},
  booktitle = {Proceedings of the 12th Int. Conference on
               Computer Aided Verification (CAV2000)},
  volume    = {1855},
  series    = ser-LNCS,
  pages     = {139--153},
  month     = jul,
  year      = {2000},
  publisher = pub-SV,
  annote    = {Gives examples of non-circular compositional reasoning,
               unlike \cite{Abadi:1993:CS} which is also shown to be incomplete.}
}
@Article{Oberg:2000:NBP,
  author  = {James Oberg},
  title   = {{NASA's} big push for the space station},
  journal = {IEEE Spectrum},
  volume  = {37},
  number  = {11},
  pages   = {49--54},
  month   = nov,
  year    = {2000},
  annote  = {Describes problems and workarounds while deploying the
             new space station ISS. States that the software on the ISS is far from
             well tested because of the tight schedule. Cite: ``We launched the
             Space Shuttle when we were 90 percent ready, but we're launching
             Space Station at only 50 percent.'' An example of planning flaws is
             the construction of Plasma Contact Units (PCU): because the ISS runs
             130-180 V power (instead of 24-28 V in earlier designs) and orbits
             in thin plasma, a voltage threshold for arcing (which is at about
             40 to 60 Vdc) is surpassed by the outer skin of the spacecraft which
             endangers solar cells and outboard equipment and causes hazards
             for astronauts on space walks. Two PCUs were added to the design
             which are ion beams constantly shooting ions into space to decrease
             the electric potential. If one PCU breaks down, the other can still
             relieve the potential, but to fix a broken PCU a spacewalk is
             required! (The procedures now are to shut down part of the ISS
             in this situation and only run 24-28 V during repair.) Astronauts
             use IBM 760 Thinkpad laptop computers on board! Shows that it is
             good to still rely on heavy duty experienced technology like
             Mir and Sojus.}
}
@InProceedings{Pagnia:2000:SFE,
  author    = {Henning Pagnia and Holger Vogt and Felix
               C. G{\"a}rtner and Uwe G. Wilhelm},
  title     = {Solving Fair Exchange with Mobile Agents},
  booktitle = asama2000,
  volume    = {1882},
  series    = ser-LNCS,
  pages     = {57--72},
  address   = {Zurich, Switzerland},
  month     = sep,
  year      = 2000,
  publisher = pub-SV,
  keyword   = {mobile agent, e-commerce, security},
  abstract  = {Mobile agents have been advocated to support
               electronic commerce over the Internet. While being a
               promising paradigm, many intricate problems need to
               be solved to make this vision reality. The problem
               of \emph{fair exchange} between two agents is one
               such fundamental problem. Informally speaking, this
               means to exchange two electronic items in such a way
               that neither agent suffers a disadvantage. We study
               the problem of fair exchange in the mobile agent
               paradigm. We show that while existing protocols for
               fair exchange can be substantially simplified in the
               context of mobile agents, there are still many
               problems related to security which remain difficult
               to solve. We propose three increasingly flexible
               solutions to the fair exchange problem and show how
               to implement them using existing agent
               technology. The basis for ensuring the security
               properties of fair exchange is a tamper-proof
               hardware device called a trusted processing
               environment.}
}
@Article{Perry:2000:DAR,
  author  = {Tekla S. Perry},
  title   = {faults \& failures: Does anybody really know what time
             it is?},
  journal = {IEEE Spectrum},
  volume  = {37},
  number  = {10},
  pages   = {26--28},
  month   = oct,
  year    = {2000},
  annote  = {Another amusing story in this regular column: studies
             the reasons behind the problem of VCRs not adjusting
             to the right time. For much of 1999, video cassette
             recorders (VCRs) around the U.S. were showing the wrong
             time. It affected only machines which had an automatic
             time adjuster built in. This adapter reads the time
             which is broadcasted as part of the public broadcasting
             service (PBS, in German it's Videotext) and adjusts the
             VCR's clock to it. The reason for this fault was an
             incorrect time broadcasted by some PBS stations and it
             took months to locate it. This was due to hardly any
             user response (an article of a journalist triggered
             a wide response after months) and due to PBS providers
             not knowing how to set the broadcasted time correctly.}
}
@InProceedings{Pleisch:2000:MFT,
  author    = {Stefan Pleisch and Andr{\'e} Schiper},
  title     = {Modeling fault-tolerant mobile agent execution as a
               sequence of agreement problems},
  booktitle = pro-srds2000,
  pages     = {11--20},
  address   = {N\"urnberg, Germany},
  month     = oct,
  year      = {2000},
  publisher = pub-IEEE
}
@TechReport{Prasetya:2000:FFT,
  author      = {I. S. W. B. Prasetya and S. D. Swierstra},
  title       = {Factorizing Fault Tolerance},
  institution = {University of Utrecht, Department of Computer Science},
  number      = {UU-CS-2000-02},
  address     = {Utrecht, The Netherlands},
  year        = {2000},
  note        = {Appears in special issue of TCS on fault tolerance},
  annote      = {This is the paper which first introduced me to the issue
                 of composition of liveness properties. The paper proposes
                 a composition law which is based on the notion of
                 `temporal non-interference'. This means the following:
                 Given a component $P$ which satisfies $p\leadsto q$ and a
                 component $Q$ which does not interfere with $P$'s progress
                 as long as some flag $a$ is high, then the parallel
                 composition of $P$ and $Q$ satisfies $p\leadsto q$ if $P$
                 raises $a$ long enough. The point is that unlike the usual
                 composition of e.g. self-stabilizing algorithms (like in
                 \cite{Herman:1991:ATD,Dolev:1993:SDS} and also
                 \cite{Gouda:1991:SCP}) the component $Q$ may interfere
                 with $P$ at some times (but only after $P$ has reached
                 progress). The composition law is applied in an example
                 where fault-tolerance is achieved through exception
                 handling. The framework is built on top of UNITY
                 \cite{Chandy:1988:PPD} and checked using HOL.}
}
@Article{Randell:2000:TML,
  author  = {Brian Randell},
  title   = {{Turing Memorial Lecture}: Facing Up to Faults},
  journal = j-COMP-J,
  volume  = {43},
  number  = {2},
  pages   = {95--106},
  year    = {2000},
  url     = {http://www3.oup.co.uk/computer_journal/hdb/Volume_43/Issue_02/430095.sgm.abs.html;
             http://www3.oup.co.uk/computer_journal/hdb/Volume_43/Issue_02/430095.pdf},
  annote  = {A wise and cunning look back at the central problems in
             fault tolerance from the viewpoint of one of the big
             men. Mentions Babbage's concern about correct mathematical
             navigation tables (see also \cite{Bowen:1993:SCS}) and his first
             ideas of n version programming. Looks on the necessity of
             fault-tolerant computing (``the more dependable computing systems
             become, the more dependence is placed on them''). Recalls
             concepts from \cite{Laprie:1992:DBC} and explicitly notes that
             the quality of fault-tolerance depends heavily on the quality of
             the fault assumption (p.100). Quote: ``Yet all too often,
             inadequate attention is paid to identifying and justifying a set
             of fault assumptions''. Notes the problems with feature
             interaction and non-interference when it comes to
             compositionality. Quote: ``All fault tolerance involves the use
             of redundancy---of representation and/or activity---whose
             consistency can be checked.'' Notes that notions of diversity are
             not very well understood and that ad hoc standards in operating
             systems are a problem when it comes to fault tolerance through
             system diversity.}
}
@Article{Schoder:2000:TOR,
  author          = {Detlef Schoder and Torsten Eymann},
  title           = {Technical opinion: The real challenges of mobile
                     agents},
  journal         = j-CACM,
  volume          = {43},
  number          = {6},
  pages           = {111--112},
  month           = jun,
  year            = {2000},
  coden           = {CACMA2},
  ISSN            = {0001-0782},
  bibdate         = {Mon Sep 25 15:22:32 MDT 2000},
  url             = {http://www.acm.org/pubs/citations/journals/cacm/2000-43-6/p111-schoder/},
  acknowledgement = ack-nhfb,
  subject         = {Computer Systems Organization ---
                     Computer-Communication Networks --- General (C.2.0);
                     Computer Systems Organization ---
                     Computer-Communication Networks --- Distributed Systems
                     (C.2.4); Computing Methodologies --- Artificial
                     Intelligence --- Distributed Artificial Intelligence
                     (I.2.11)},
  annote          = {states that mobile agents should have a kind of
                     self-stabilizing social behavior. Contrasts nicely to
                     \cite{Lange:1999:SGR}.}
}
@Article{Schulzki:2000:KI,
  author    = {Christiane Schulzki-Haddouti},
  title     = {{Kritische Infrastrukturen}},
  journal   = {FIfF Kommunikation},
  OPTnumber = {3},
  pages     = {19--20},
  month     = sep,
  year      = {2000},
  annote    = {Teil des Sonderheftes \cite{Bernhardt:2000:RDR}.}
}
@Article{Schumacher:2000:AI,
  author  = {M. Schumacher and M. L. Moschgath and U. Roedig},
  title   = {{Angewandte Informationssicherheit} --- {Ein
             Hacker-Praktikum an Universit\"aten}},
  journal = {Informatik Spektrum},
  volume  = {23},
  number  = {3},
  pages   = {202--211},
  month   = jun,
  year    = {2000},
  annote  = {Presents an interesting course taught at TU Darmstadt:
             Students had to attack and defend a network of PCs to
             learn the practices of ``real'' network security.}
}
@InProceedings{Stoller:2000:EDG,
  author    = {Scott D. Stoller and Leena Unnikrishnan and Yanhong A. Liu},
  title     = {Efficient detection of global properties in distributed
               systems using partial-order methods},
  booktitle = {Computer Aided Verification (CAV 2000)},
  volume    = {1855},
  series    = ser-LNCS,
  year      = {2000},
  publisher = pub-SV,
  annote    = {Uses the ``persistent-set technique'' (a method known
               from partial order research to optimize state space search) to
               detect possibly and definitely in distributed computations. The
               algorithm is compared to two special case algorithms by Garg
               and Waldecker and it is shown to (a) handle a larger class of
               predicates, and (b) have the same worst case asymptotic time
               complexity. Results are backed by simulation data.}
}
@Book{Tel:2000:IDA,
  author    = {Gerard Tel},
  title     = {Introduction to Distributed Algorithms},
  edition   = {Second},
  publisher = {Cambridge University Press},
  year      = {2000}
}
@InProceedings{Verissimo:2000:TCB,
  author      = {Paulo Ver{\'\i}ssimo and Antonio Casimiro and Christof Fetzer},
  title       = {The Timely Computing Base: Timely Actions in the Presence
                 of Uncertain Timeliness},
  booktitle   = {Proceedings of the International Conference on
                 Dependable Systems and Networks},
  pages       = {533--542},
  address     = {New York City, USA},
  month       = jun,
  year        = {2000},
  publisher   = {IEEE Computer Society Press},
  abstractURL = {http://www.di.fc.ul.pt/~casim/papers/dsn00/dsn00.html},
  documentURL = {http://www.di.fc.ul.pt/~casim/papers/dsn00/dsn00.ps.gz},
  annote      = {[to read]}
}
@PhdThesis{Voelzer:2000:FRK,
  author = {Hagen V{\"o}lzer},
  title  = {{Fairness, Randomisierung und Konspiration in
            verteilten Algorithmen}},
  school = {Humboldt Universit\"at zu Berlin, Fakult\"at f\"ur
            Informatik},
  month  = dec,
  year   = {2000},
  url    = {\url{http://dochost.rz.hu-berlin.de/abstract.php3/dissertationen/voelzer-hagen-2000-12-08}}
}
@Article{Wang:2000:PDA,
  author   = {Wenli Wang and Zolt{\'a}n Hidv{\'e}gi and
              Bailey, Jr., Andrew D. and Andrew B. Whinston},
  title    = {{E}-Process Design and Assurance Using Model
              Checking},
  journal  = {Computer},
  volume   = {33},
  number   = {10},
  pages    = {48--53},
  month    = oct,
  year     = {2000},
  url      = {http://www.computer.org/computer/co2000/rx048abs.htm;
              http://dlib.computer.org/co/books/co2000/pdf/rx048.pdf},
  abstract = {Using a simple online ticket sales example, the
              authors demonstrate that model checking can help
              businesses verify their e-processes.},
  annote   = {Shows that with model checking you can do model
              checking. Nothing particular to e-commerce or
              security (unfortunately).}
}
@Book{Bergstra:2001:HPA,
  editor    = {Jan A. Bergstra and Alban Ponse and Scott A. Smolka},
  title     = {Handbook of Process Algebra},
  publisher = {North-Holland},
  year      = {2001},
  OPTannote = {schon da?}
}
@Article{Furse:2001:DTW,
  author  = {Cynthia Furse and Randy Haupt},
  title   = {Down to the Wire},
  journal = {IEEE Spectrum},
  volume  = {38},
  number  = {2},
  pages   = {34--39},
  month   = feb,
  year    = {2001},
  annote  = {Drastic feature about the risks of aging wiring in
             aircraft. Airplanes stay in use for more than 20 years and so
             many parts are in danger of failing because of age. Especially
             wires are critical because the aircraft is full of them and they
             cannot be easily replaced. Studies show that in 20+ years old
             aircraft there are between 1.6 and 13 cracks per 1000 meters
             of wire (there are about 240 km of wire in a Lockheed L-1011).
             Similar things count for military jets which stay in operation
             much longer (B-52s for example for 80 years). Faults can lead
             to sparks, fire, information loss, transient communication loss.
             Diagnosis tools are already good, but what is needed is
             prognosis.}
}
@PhdThesis{Gaertner:2001:FGF,
  author = {Felix C. G{\"a}rtner},
  title  = {Formale Grundlagen der Fehlertoleranz in verteilten
            Systemen},
  school = {Fachbereich Informatik, TU Darmstadt},
  month  = may,
  year   = {2001},
  url    = {\url{http://elib.tu-darmstadt.de/diss/000162/}}
}
@InProceedings{Gaertner:2001:DRF,
  author    = {Felix C. G{\"a}rtner and Hagen V{\"o}lzer},
  title     = {Defining Redundancy in Fault-Tolerant Computing},
  booktitle = {Brief Announcement at the 15th International
               Symposium on DIStributed Computing (DISC 2001)},
  address   = {Lisbon, Portugal},
  month     = oct,
  year      = {2001}
}
@TechReport{Gaertner:2001:GIF,
  author      = {Felix C. G{\"a}rtner},
  title       = {A gentle introduction to failure detectors and related
                 problems},
  institution = {Darmstadt University of Technology, Department of
                 Computer Science},
  number      = {TUD-BS-2001-01},
  month       = apr,
  year        = {2001},
  url         = {\url{http://www.informatik.tu-darmstadt.de/BS/Gaertner/publications/TUD-BS-2001-01.ps.gz}},
  annote      = {A more informal introduction to defining and using
                 unreliable failure detectors \cite{Chandra:1996:UFD}
                 in the design and analysis of fault tolerant distributed
                 algorithms.}
}
@InProceedings{Gaertner:2001:IPD,
  author    = {Felix C. G{\"a}rtner and Stefan Pleisch},
  title     = {{(Im)Possibilities} of predicate detection in
               crash-affected systems},
  booktitle = {Proceedings of the 5th Workshop on Self-Stabilizing Systems
               (WSS2001)},
  volume    = {2194},
  series    = ser-LNCS,
  pages     = {98--113},
  address   = {Lisbon, Portugal},
  month     = oct,
  year      = {2001},
  publisher = pub-SV
}
@TechReport{Gaertner:2001:IPDIBM,
  author      = {Felix C. G{\"a}rtner and Stefan Pleisch},
  title       = {{(Im)Possibilities} of Predicate Detection in Crash-Affected Systems},
  institution = {IBM Research Laboratory, Zurich},
  type        = {Research Report},
  number      = {RZ 3361 (\# 93407)},
  month       = aug,
  year        = {2001},
  url         = {\url{http://domino.watson.ibm.com/library/CyberDig.nsf/Search}}
}
@Misc{LeLann:2001:ART,
  author       = {Le Lann, G{\'e}rard},
  title        = {Is asynchronous real-time an oxymoron?},
  howpublished = {Invited presentation at the 15th International
                  Symposium on DIStributed Computing (DISC 2001)},
  month        = oct,
  year         = {2001},
  note         = {Lisbon, Portugal},
  OPTannote    = {related reference is \cite{LeLann:1995:ORN}. Is this
                  published anywhere?}
}
@InProceedings{Aguilera:2002:OIF,
  author    = {Marcos K. Aguilera and Le Lann, G{\'e}rard and Sam Toueg},
  title     = {On the impact of fast failure detectors on real-time
               fault-tolerant systems},
  booktitle = {Proceedings of the 16th International
               Symposium on DIStributed Computing (DISC 2002)},
  crossref  = {Mahlki:2002:DC},
  editor    = {Dahlia Malkhi},
  volume    = {2508},
  series    = ser-LNCS,
  pages     = {354--369},
  address   = {Toulouse, France},
  month     = oct,
  year      = {2002},
  publisher = pub-SV
}
@InProceedings{Gaertner:2002:FDS,
  author    = {Felix C. G{\"a}rtner and Stefan Pleisch},
  title     = {Failure detection sequencers: {Necessary} and sufficient
               information about failures to solve predicate detection},
  booktitle = {Proceedings of the 16th International
               Symposium on DIStributed Computing (DISC 2002)},
  crossref  = {Mahlki:2002:DC},
  editor    = {Dahlia Malkhi},
  volume    = {2508},
  series    = ser-LNCS,
  pages     = {280--294},
  address   = {Toulouse, France},
  month     = oct,
  year      = {2002},
  publisher = pub-SV
}
@TechReport{Gaertner:2002:FDSIBM,
  author      = {Felix C. G{\"a}rtner and Stefan Pleisch},
  title       = {Failure detection sequencers: {Necessary} and sufficient
                 information about failures to solve predicate detection},
  institution = {IBM Research Laboratory, Zurich},
  type        = {Research Report},
  number      = {RZ 3438},
  OPTmonth    = aug,
  year        = {2002},
  url         = {\url{http://domino.watson.ibm.com/library/CyberDig.nsf/Search}}
}
@TechReport{Gaertner:2002:RLPCSS,
  author      = {Felix C. G{\"a}rtner},
  title       = {Revisiting Liveness Properties in the Context of Secure
                 Systems},
  institution = {Swiss Federal Institute of Technology (EPFL), School of
                 Computer and Communication Sciences},
  number      = {200278},
  address     = {Lausanne, Switzerland},
  month       = nov,
  year        = {2002}
}
%%% Journal article (journal macro j-DC), volume 15, number 1.
%%% Page range and month are still empty OPT placeholders.
@Article{Guerraoui:2002:NBA,
  author    = {Rachid Guerraoui},
  title     = {Non-Blocking Atomic Commitment in Asynchronous Systems
               with Failure Detectors},
  journal   = j-DC,
  volume    = {15},
  number    = {1},
  year      = {2002},
  OPTkey    = {},
  OPTpages  = {},
  OPTmonth  = {},
  OPTnote   = {},
  OPTannote = {}
}
%%% TCS 2002 paper (17th IFIP World Computer Congress), August 2002.
%%% The address spelling was corrected from "Monteal".
@InProceedings{Guerraoui:2002:WFD,
  author          = {Rachid Guerraoui and Petr Kouznetsov},
  title           = {On the weakest failure detector for non-blocking
                     atomic commit},
  OPTcrossref     = {},
  OPTkey          = {},
  booktitle       = {Proceedings of the International Conference on Theoretical
                     Computer Science (TCS 2002), 17th IFIP World Computer
                     Congress},
  OPTpages        = {},
  year            = 2002,
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  address         = {Montr{\'e}al, Canada},
  month           = aug,
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  OPTannote       = {}
}
%%% Journal article (journal macro j-IEEE-TRANS-COMP), volume 51, number 8,
%%% pages 931--944, with a long personal annotation about late binding.
%%% Names use the "Last, First" form so that "Le Lann" is kept together as
%%% the family name and the hyphenated given name can be abbreviated.
@Article{Hermant:2002:FAU,
  author    = {Hermant, Jean-Fran{\c{c}}ois and Le Lann, G{\'e}rard},
  title     = {Fast asynchronous uniform consensus in real-time
               distributed systems},
  journal   = j-IEEE-TRANS-COMP,
  year      = {2002},
  OPTkey    = {},
  volume    = {51},
  number    = {8},
  pages     = {931--944},
  month     = aug,
  OPTnote   = {},
  annote    = {A very relevant paper regarding the practicality of
    the failure detector approach. The basic idea of the paper is to
    use the principle of `late binding' (known from programming
    languages) to build real-time distributed protocols from
    asynchronous solutions for the `time-free' version of the
    problem. The approach is as follows: for a real-time problem, (1)
    turn the specification into a time-free problem (e.g. by basing
    timeliness requirements on certain activation conditions using
    time-free extensions to the asynchronous model - like failure
    detectors), then devise an asynchronous solution, (2) design a
    solution to the time-free extension in an as weak partially
    synchronous model as possible, (3) if the original problem is a
    real-time problem or in case one needs to predict real-time
    behavior, bind the parameters of the time-free extension to some
    possibly stronger partially synchronous model and establish time
    bounds for the extension, from that establish time bounds for the
    overall algorithm. Why is late binding good? First of all,
    devising solutions in this way results in systems that satisfy
    safety and liveness with the highest amount of coverage possible
    under the fault assumption (the coverage of the asynchronous model
    - because it makes no assumption - is higher than any (partially)
    synchronous model). Second, early binding of a solution makes you
    have to reason about timing and scheduling even if the original
    problem is not a real-time computing problem. The paper shows how
    late binding can be done using uniform consensus based on a strong
    failure detector (using which algorithm?), implementing the
    failure detector in a real-time Ethernet, and from that deriving a
    fast uniform consensus algorithm. This approach also has the
    advantage that failure detection has expedited delivery and so the
    failure detection time can be magnitudes smaller than regular
    message delivery (see also \cite{Aguilera:2002:OIF}). The
    timed-asynchronous (TA) system model \cite{Cristian:1999:TAD} and
    the timely computing base (TCB) \cite{Verissimo:2000:TCB} all do
    early binding. These models try then to enforce timing assumptions
    by what here is called ``measure-compare-and-kill'' (similar to
    the ``process controlled crash'' explained in
    \cite[p.14]{Defago:2000:ARP} used in ISIS and other systems). This
    means that a continuing timing failure detection takes place and
    that late services are treated as omissions, and it assumes that
    every timing failure is detected to maintain the confidence in the
    correctness. However, this means to perform scheduling and
    real-time analyses almost everywhere in the system, which can be
    tough. If these bounds are violated, the system might even lose
    liveness. The paper concludes (rather strongly) that ``TA and TCB
    lead to inefficient working solutions.'' Overall, this paper is
    both conceptual and technical (with a lot of real-time stuff) and
    argues strongly for its points. Some material presented at DISC
    2001 \cite{LeLann:2001:ART}.}
}
%%% LCTES/SCOPES 2002 conference paper, pages 74--83.
%%% Venue details (address, month, publisher) are still empty OPT placeholders.
@InProceedings{Jhumka:2002:SDC,
  author          = {Arshad Jhumka and Martin Hiller and Vilgot Claesson and
                     Neeraj Suri},
  title           = {On systematic design of consistent executable assertions
                     for distributed embedded software},
  booktitle       = {Proceedings of the ACM Joint Conference on Languages,
                     Compilers and Tools for Embedded Systems/Software
                     and Compilers for Embedded Systems (LCTES/SCOPES)},
  pages           = {74--83},
  year            = {2002},
  OPTcrossref     = {},
  OPTkey          = {},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  OPTaddress      = {},
  OPTmonth        = {},
  OPTorganization = {},
  OPTpublisher    = {},
  OPTnote         = {},
  OPTannote       = {}
}
%%% EPFL technical report number 200263, dated September 2002.
%%% The umlaut in the second author's name is written as a TeX accent, as in
%%% the other entries, because this file declares an ISO/ASCII codetable and
%%% classic BibTeX does not handle raw 8-bit or UTF-8 letters reliably.
@TechReport{Jhumka:2002:OSD,
  author      = {Arshad Jhumka and Felix C. {G\"artner} and
                 Christof Fetzer and Neeraj Suri},
  title       = {On Systematic Design of Fast and Perfect Detectors},
  institution = {Swiss Federal Institute of Technology (EPFL), School of
                 Computer and Communication Sciences},
  year        = {2002},
  OPTkey      = {},
  OPTtype     = {},
  number      = {200263},
  address     = {Lausanne, Switzerland},
  month       = sep,
  OPTnote     = {},
  OPTannote   = {}
}
%%% ICDCS 2002 conference paper, pages 337--344, July 2002
%%% (publisher macro pub-IEEE).
@InProceedings{Kulkarni:2002:CAF,
  author          = {Sandeep S. Kulkarni and A. Ebnenasir},
  title           = {Complexity of adding failsafe fault-tolerance},
  booktitle       = {Proceedings of the 22nd IEEE International Conference
                     on Distributed Computing Systems (ICDCS 2002)},
  pages           = {337--344},
  month           = jul,
  year            = {2002},
  publisher       = pub-IEEE,
  OPTcrossref     = {},
  OPTkey          = {},
  OPTeditor       = {},
  OPTvolume       = {},
  OPTnumber       = {},
  OPTseries       = {},
  OPTaddress      = {},
  OPTorganization = {},
  OPTnote         = {},
  OPTannote       = {}
}
%%% Proceedings volume of DISC 2002 (number 2508 of the ser-LNCS series).
%%% This is the intended crossref parent of the DISC 2002 paper entries, so
%%% it has to stay after them in the file for classic BibTeX.
@Book{Malkhi:2002:DC,
  editor     = {Dahlia Malkhi},
  title      = {Distributed Computing. 16th International Conference
                (DISC 2002)},
  series     = ser-LNCS,
  number     = {2508},
  publisher  = pub-SV,
  address    = {Toulouse, France},
  month      = oct,
  year       = {2002},
  OPTkey     = {},
  OPTvolume  = {},
  OPTedition = {},
  OPTnote    = {},
  OPTannote  = {}
}
%%% PhD thesis, TU Darmstadt, November 2002; the url field links to the
%%% electronic copy.
@PhdThesis{Muehl:2002:FGF,
  author     = {Gero {M\"uhl}},
  title      = {Large-Scale Content-Based Publish-Subscribe Systems},
  school     = {Fachbereich Informatik, TU Darmstadt},
  month      = nov,
  year       = {2002},
  url        = {\url{http://elib.tu-darmstadt.de/diss/000274/}},
  note       = {},
  OPTkey     = {},
  OPTtype    = {},
  OPTaddress = {},
  OPTannote  = {},
}
%%% Journal article (journal macro j-COMP-J), volume 46, number 1.
%%% Page range and month are still empty OPT placeholders.
@Article{Pagnia:2003:FE,
  author    = {Henning Pagnia and Holger Vogt and Felix C. {G\"artner}},
  title     = {Fair Exchange},
  journal   = j-COMP-J,
  volume    = {46},
  number    = {1},
  year      = {2003},
  OPTkey    = {},
  OPTpages  = {},
  OPTmonth  = {},
  OPTnote   = {},
  OPTannote = {}
}
%%% Placeholder for a work recorded as forthcoming; no year is known, which
%%% is why the citation tag carries no year component.
@Misc{Hadzilacos:FFT,
  author       = {Vassos Hadzilacos and Prasad Jayanti and Sam Toueg},
  title        = {Fundamentals of Fault-Tolerant Distributed Computing},
  howpublished = {Forthcoming},
  annote       = {Referenced in \cite{Hadzilacos:1994:MAF} but
                  obviously has not been published yet.},
  OPTcrossref  = {},
  OPTkey       = {},
  OPTyear      = {},
  OPTmonth     = {},
  OPTnote      = {}
}
%%% Web page of the MAFTIA project.  There is no author, so the key field
%%% supplies the text used for sorting and label generation.
@Misc{MAFTIA,
  key          = {MAFTIA},
  title        = {MAFTIA Home -- {Malicious- and Accidental-Fault Tolerance for
                  Internet Applications}},
  howpublished = {Internet:
                  \url{http://www.newcastle.research.ec.org/maftia/}},
  OPTauthor    = {},
  OPTmonth     = {},
  OPTyear      = {},
  OPTnote      = {},
  OPTannote    = {}
}