From b34f19c0d1f133ad84c4144bff332dc21db29995 Mon Sep 17 00:00:00 2001 From: Erik Poll Date: Wed, 15 Feb 2017 10:12:40 +0100 Subject: [PATCH] shit --- acmart.bib | 89 ------------------ conclusions.tex | 19 ++-- introduction.tex | 18 ++-- learning_results.tex | 16 ++-- learning_setup.tex | 192 ++++++++++++++++++++++++++++++++------- preliminaries.tex | 19 +++- security_definitions.tex | 5 +- sigproc.bib | 78 ++++++++-------- the_sut.tex | 28 +++--- 9 files changed, 260 insertions(+), 204 deletions(-) delete mode 100644 acmart.bib diff --git a/acmart.bib b/acmart.bib deleted file mode 100644 index bb56301..0000000 --- a/acmart.bib +++ /dev/null @@ -1,89 +0,0 @@ -@misc{TeXFAQ, - author = {{UK \TeX{} Users Group}}, - howpublished = {\url{http://www.tex.ac.uk}}, - title = {{UK} List of {\TeX} Frequently Asked Questions}, - year = {2016}, -} - -@Manual{Downes04:amsart, - title = {The \textsf{amsart}, \textsf{amsproc}, and - \textsf{amsbook} document~classes}, - author = {Michael Downes and Barbara Beeton}, - organization = {American Mathematical Society}, - year = 2004, - month = {August}, - note = {\url{http://www.ctan.org/pkg/amslatex}} -} - -@Manual{instr-l, - title = {Instructions for Preparation of Papers and - Monographs, {AMS\LaTeX}}, - organization = {American Mathematical Society}, - month = {August}, - year = 2004, - note = {\url{http://www.ctan.org/pkg/amslatex}} -} - -@Manual{Fiorio15, - title = {{a}lgorithm2e.sty---package for algorithms}, - author = {Cristophe Fiorio}, - month = {October}, - year = 2015, - annote = {\url{http://www.ctan.org/pkg/algorithm2e}} -} - -@Manual{Brito09, - title = {The algorithms bundle}, - author = {Rog\'erio Brito}, - month = {August}, - year = 2009, - annote = {\url{http://www.ctan.org/pkg/algorithms}} -} - - -@Manual{Heinz15, - title = {The Listings Package}, - author = {Carsten Heinz and Brooks Moses and Jobst Hoffmann}, - month = {June}, - year = 2015, - note = {\url{http://www.ctan.org/pkg/listings}} -} - -@Manual{Fear05, - title = {Publication quality tables in {\LaTeX}}, - author = {Simon Fear}, - month = {April}, - year = 2005, - note = {\url{http://www.ctan.org/pkg/booktabs}} -} - -@Manual{ACMIdentityStandards, - title = {{ACM} Visual Identity Standards}, - organization = {Association for Computing Machinery}, - year = 2007 -} - -@Manual{Sommerfeldt13:Subcaption, - title = {The subcaption package}, - author = {Axel Sommerfeldt}, - month = {April}, - year = 2013, - note = {\url{http://www.ctan.org/pkg/subcaption}}, -} - - -@Manual{Nomencl, - title = {A package to create a nomenclature}, - author = {Boris Veytsman and Bern Schandl and Lee Netherton - and CV Radhakrishnan}, - month = {September}, - note = {\url{http://www.ctan.org/pkg/nomencl}}, - year = 2005} - -@Manual{Talbot16:Glossaries, - title = {User Manual for glossaries.sty v4.25}, - author = {Nicola L. C. Talbot}, - month = {June}, - year = 2016, - note = {\url{http://www.ctan.org/pkg/subcaption}}} - diff --git a/conclusions.tex b/conclusions.tex index 06faea9..760b550 100644 --- a/conclusions.tex +++ b/conclusions.tex @@ -5,16 +5,21 @@ formalized several security and functional properties drawn from the SSH RFC spe properties on the learned models using model checking and have uncovered several minor inconsistencies, though crucially, the security critical properties were met by all implementation. -Abstraction was provided by a {\dmapper} component placed between the {\dlearner} and the {\dsut}. The {\dmapper} was constructed from an existing SSH -implementation. 
The alphabet the {\dmapper} exposed to the {\dlearner}, explored key exchange and setting up a secure connection, several authentication methods -and opening and closing single of channels over which the terminal service could be requested. We used two alphabets, a full version for OpenSSH, and -a restricted version for the other implementations. The restricted alphabet was still sufficient to explore most aforementioned behavior. +Abstraction was provided by a {\dmapper} component placed between the +{\dlearner} and the {\dsut}. The {\dmapper} was constructed from an +existing SSH implementation. The input alphabet of the {\dmapper} +explored key exchange, setting up a secure connection, several +authentication methods and opening and closing channels over which the +terminal service could be requested. We used two input alphabets, a +full version for OpenSSH, and a restricted version for the other +implementations. The restricted alphabet was still sufficient to +explore most of the aforementioned behavior. There were several challenges encountered. Firstly, building a {\dmapper} presented a considerable technical challenge, as it required re-structuring of an actual SSH implementation. Secondly, because we used classical learning algorithms, we had to ensure that the abstracted implementation behaved like a deterministic Mealy Machine. Herein, time induced non-determinism was difficult to eliminate. Buffering also presented problems, -leading to a considerable increase in the number of states. Moreover, the systems analyzed were relatively slow, which meant learning and testing took -several days. This was compounded by the size of the learning alphabet, and it forced us into using a reduced alphabet for two of the analyzed implementations. +leading to a considerable increase in the number of states. Moreover, the systems analyzed were relatively slow, which meant learning took +several days\marginpar{\tiny Erik: For a single server, right??}. This was compounded by the size of the learning alphabet, and it forced us into using a reduced alphabet for two of the analyzed implementations. Limitations of the work, hence elements for future work, are several. First of all, the {\dmapper} was not formalized, unlike in~\cite{TCP2016}, thus we did not produce a concretization of the abstract models. Consequently, model checking results cannot be fully transferred to the actual implementations. Formal definition @@ -25,4 +30,4 @@ can infer systems with parameterized alphabets, state variables and simple opera Despite these limitations, our work provides a compelling application of learning and model checking in a security setting, on a widely used protocol. We hope this lays -some more groundwork for further case studies, as well as fresh advancements in learning techniques. \ No newline at end of file +some more groundwork for further case studies, as well as advances in learning techniques. diff --git a/introduction.tex b/introduction.tex index 1681baa..60e29a0 100644 --- a/introduction.tex +++ b/introduction.tex @@ -1,14 +1,20 @@ \section{Introduction}\label{introduction} -The SSH protocol is used interact securely with remote machines. Alongside TLS and IPSec, SSH is amongst the most frequently used security suites~\cite{Albrecht2009Plaintext}. Due to its significant user base and sensitive nature, flaws in the protocol or its implementation could have major impact. It therefore comes as no surprise that SSH has attracted scrutiny from the security community.
-The protocol specification has been subjected to various security analyses in \cite{Albrecht2009Plaintext,Bellare2004Breaking,Williams2011Analysis,Paterson2010PlaintextDependent}. -Formal methods have also been applied. Poll et. al. in \cite{Poll_rigorous_2011} formulate a thorough specification of SSH's Transport layer. They then use this specification to verify OpenSSH by manually inspecting the source code. The same specification is later used by Erik Boss\cite{Boss2012} in model based testing of the implementation. Udrea et al.\cite{Udrea_rule-based_2008} use static analysis to check two implementations of SSH against an extensive set of rules. +The SSH protocol is used to interact securely with remote machines. Alongside TLS and IPSec, SSH is among the most frequently used network security protocols~\cite{Albrecht2009Plaintext}. Due to its significant user base and sensitive nature, flaws in the protocol or its implementation could have major impact. It therefore comes as no surprise that SSH has attracted scrutiny from the security community. +SSH has been subjected to various security analyses \cite{Albrecht2009Plaintext,Bellare2004Breaking,Williams2011Analysis,Paterson2010PlaintextDependent}. +Formal methods have been applied in analysing implementations of +SSH: Protocol state machines of SSH's transport layer +were used in a manual code review of OpenSSH +\cite{Poll_rigorous_2011} and a formal program verification of a Java +implementation of SSH \cite{PollSchubert07}. These models have also +been used for model based testing \cite{Boss2012}. +Udrea et al.\cite{Udrea_rule-based_2008} use static analysis to check +two implementations of SSH against an extensive set of rules. -%In \cite{Boss2012}, model based testing was used to verify %Academic researchers have so far focused more on the theoretical aspects than on implementations of the protocol. In this work, we use classical active automata learning, or simply, model learning, to infer state machines of three SSH implementations, which we then verify by model checking. -Model learning has previously been applied to infer state machines of EMV bank cards~\cite{Aarts2013Formal}, electronic passports~\cite{Aarts2010Inference} and hand-held readers for online banking~\cite{Chalupar2014Automated}. More recently, it was used to learn implementations of TCP~\cite{TCP2016} and TLS~\cite{RuiterProtocol}. Model learning's goal is to obtain a state model of a black-box system by providing inputs and observing outputs. The learned state model corresponds to the observed behavior, and can be used in system analysis. Since model learning builds from a finite number of observations, we can never be sure that the learned model is correct. To that end, advanced conformance algorithms are employed\cite{LeeY96}, which yield some confidence that the system inferred is in fact correct. In the context of testing protocols, model learning can be seen as a form of protocol state fuzzing, whereby unexpected inputs are sent to a system under test in the hope of exposing hidden anomalies. In model learning, inputs are sent with no regard to the order imposed by the protocol. Any anomalies are then exposed in the learned model. +Model learning has previously been applied to infer state machines of EMV bank cards~\cite{Aarts2013Formal}, electronic passports~\cite{Aarts2010Inference}, hand-held readers for online banking~\cite{Chalupar2014Automated}, and implementations of TCP~\cite{TCP2016} and TLS~\cite{RuiterProtocol}.
Model learning aims to obtain a state machine model of a black-box system by providing inputs and observing outputs. The learned state model corresponds to the observed behavior, and can be used in system analysis. Since model learning builds from a finite number of observations, we can never be sure that the learned model is correct. To that end, advanced conformance algorithms are employed\cite{LeeY96}, which yield some confidence that the system inferred is in fact correct. In the context of testing protocols, model learning can be seen as a form of protocol state fuzzing, whereby unexpected inputs are sent to a system under test in the hope of exposing hidden anomalies. In model learning, inputs are sent with no regard to the order imposed by the protocol. Any anomalies are then exposed in the learned model. Having obtained models, we use model checking to automatically verify their conformance to both functional and security properties. The properties are drawn out of the RFC specifications\cite{rfc4251,rfc4252,rfc4253,rfc4254} and formalized in LTL. They are then checked for truth on the learned model using NuSMV~\cite{NuSMV}. Manually verifying these properties would be difficult, as the learned models are reasonably large. Moreover, formalizing properties means we can also better assess and overcome vagueness or under-specification in the RFC standards. @@ -43,4 +49,4 @@ information available from traces without knowledge of the security key. %Besides security-related logical flaws, inferred state machines can show quirks such as superfluous states. Although these might not be directly exploitable, OpenBSD auditors illustrate why these small bugs should be resolved: ``we are not so much looking for security holes, as we are looking for basic software bugs, and if years later someone discovers the problem used to be a security issue, and we fixed it because it was just a bug, well, all the better''\footnote{\url{http://www.openbsd.org/security.html}}. % %\textit{Organization.} An outline of the SSH protocol will be provided in Section~\ref{ssh}. The experimental setup is discussed in Section~\ref{setup}. The results are subsequently discussed in Section~\ref{results}, after which we conclude in Section~\ref{conclusions}. -% \ No newline at end of file +% diff --git a/learning_results.tex b/learning_results.tex index ec6456e..52f044a 100644 --- a/learning_results.tex +++ b/learning_results.tex @@ -12,7 +12,9 @@ Certain arrangements had to be made including the setting of timing parameters t \begin{figure*} \centering \includegraphics[scale=0.30]{ssh-server} - \caption{OpenSSH server. States are distributed into 3 clusters, one for each layer, plus a state for when connection was lost. + \caption{OpenSSH server. States are collected in 3 clusters, + indicated by the rectangles, where each cluster corresponds to + one of the protocol layers. We eliminate redundant states and information induced by the {\dmapper}, as well as states present in successful rekey sequences. Wherever rekey was permitted, we replaced the rekey states and transitions by a single \textsl{REKEY SEQUENCE} transition. We also factor out edges common to states within a cluster. We replace common disconnecting edges, by one edge from the cluster to the disconnect state. Common self loop edges are colored, and the actual i/o information only appears on one edge. Transitions with similar start and end states are joined together on the same edge. 
Transition labels are kept short by regular expressions(UA\_* stands for all inputs starting with UA\_) or by factoring out common start strings. Green edges highlight the happy flow. } \label{fig:sshserver} \end{figure*} @@ -38,7 +40,7 @@ distinguishing sequence. The exhaustive variant for a set {\dk}, generates tests namely, that the learned model is correct unless the (unknown) model of the implementation has at least {\dk} more states. The random variant produces tests with randomly generated middle sections. No formal confidence is provided, but past experience shows this to be more effective at finding counterexamples since {\dk} can be set to higher values. We executed a random test suite with {\dk} of 4 comprising 40000 tests for OpenSSH, and 20000 tests for BitVise and DropBear. -We then ran an exhaustive test suite with {\dk} of 2 for for all implementations. +We then ran an exhaustive test suite with {\dk} of 2 for all implementations. Table~\ref{tab:experiments} describes the exact versions of the systems analyzed together with statistics on learning and testing, namely: @@ -62,11 +64,13 @@ DropBear v2014.65 & 17 & 2 & 19863 \label{tab:experiments} \end{table} - The large number of states is down to several reasons. First of all, some systems exhibited buffering behavior. In particular, BitVise would queue -responses for higher layer inputs sent during key re-exchange, and would deliver them all at once, after the exchange was done. Re-exchanging keys (rekey-ing) was also -a major contributor to the number of states. In states allowing rekey, following the sequence of transitions comprising the rekey should lead back to the starting state. This -leads to 2 additional rekey states for every state permitting rekey. A considerable number of states were also added due to {\dmapper} generated outputs such as \textsc{ch\_none} or \textsc{ch\_max}, outputs which signal that no channel is open or that the maximum number of channels have been opened. +responses for higher layer inputs sent during key re-exchange, and would deliver them all at once after the exchange was done. Rekeying was also +a major contributor to the number of states. For each state where rekeying +is possible, the sequence of transitions constituting the complete +rekeying process should lead back to that state. This +leads to two additional rekeying states for each state where rekeying +is possible. A considerable number of states were also added due to {\dmapper} generated outputs such as \textsc{ch\_none} or \textsc{ch\_max}, outputs which signal that no channel is open or that the maximum number of channels have been opened. Figure~\ref{fig:sshserver} shows the model learned for OpenSSH, with various changes applied to improve readability. The happy flow, contoured in green, is fully explored in the model and mostly matches our earlier description of it\footnote{The only exception is in the transport layer, where unlike in our happy flow definition, the server is the first to send the \textsc{newkeys} message. This is also accepted behavior, as the protocol does not specify which side should send \textsc{newkeys} first.}. Also explored is what happens when a rekey sequence is attempted. We notice that rekey is only allowed in states of the Connection layer. Strangely, for these states, rekey is not state preserving, as the generated output on receiving a \textsc{sr\_auth}, \textsc{sr\_conn} or \textsc{kex30} changes from \textsc{unimpl} to \textsc{no\_resp}.
This leads to two sub-clusters of states, one before the first rekey, the other afterward. In all other states, the first step of a rekey (\textsc{kexinit}) yields (\textsc{unimpl}), while the last step (\textsc{newkeys}) causes the system to disconnect. diff --git a/learning_setup.tex b/learning_setup.tex index 5f34e37..54191e5 100644 --- a/learning_setup.tex +++ b/learning_setup.tex @@ -8,7 +8,7 @@ The learning setup consists of three components: the {\dlearner}, {\dmapper} and {\dsut}. The {\dlearner} generates abstract inputs, representing SSH messages. The {\dmapper} transforms these messages into well-formed SSH packets and sends them to the {\dsut}. The {\dsut} sends response packets back to the {\dmapper}, which in turn, translates these packets to abstract outputs. The {\dmapper} then sends the abstract outputs back to the {\dlearner}. -The {\dlearner} uses LearnLib ~\cite{LearnLib2009}, a Java library implementing $L^{\ast}$ based algorithms for learning Mealy machines. The {\dmapper} is based on Paramiko, an open source SSH implementation written in Python\footnote{Paramiko is available at \url{http://www.paramiko.org/}}. We opted for Paramiko because its code is relatively well structured and documented. The {\dsut} can be any existing implementation of an SSH server. The three components communicate over sockets. A graphical representation of our setup is shown in Figure~\ref{fig:components}. +The {\dlearner} uses LearnLib~\cite{LearnLib2009}, a Java library implementing $L^{\ast}$ based algorithms for learning Mealy machines. The {\dmapper} is based on Paramiko, an open source SSH implementation written in Python\footnote{Paramiko is available at \url{http://www.paramiko.org/}}. We opted for Paramiko because its code is relatively well structured and documented. The {\dsut} can be any existing implementation of an SSH server. The three components communicate over sockets, as shown in Figure~\ref{fig:components}. \begin{figure} \centering \includegraphics[scale=0.29]{components.pdf} @@ -16,28 +16,45 @@ The {\dlearner} uses LearnLib ~\cite{LearnLib2009}, a Java library implementing \label{fig:components} \end{figure} -SSH is a complex client-server type protocol. It would be exceedingly difficult to learn all its facets, thus we narrow down the learning goal to learning SSH -server implementations. We further restrict learning to only exploring the terminal service of the Connection layer, as we consider it to be the most interesting -from a security perspective. Algorithms for encryption, compression and hashing are set to default settings and are not purposefully explored. Also, the starting -state of the {\dsut} is one where a TCP connection has already been established and where SSH versions have been exchanged, a required first step of the Transport layer protocol. +SSH is a complex client-server protocol. In our work so far, we have concentrated on learning models of the implementation of the server, and not of the client. +We further restrict learning to only exploring the terminal service of the Connection layer, as we consider it to be the most interesting +from a security perspective. Algorithms for encryption, compression and hashing are left to default settings and are not purposefully explored. Also, the starting +state of the {\dsut} is one where a TCP connection has already been established and where SSH versions have been exchanged, which are prerequisites for starting the Transport layer protocol.
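To make this setup concrete, the sketch below outlines the shape of the {\dmapper}'s main loop: it accepts abstract input symbols from the {\dlearner} over a socket, concretizes them, forwards them to the {\dsut}, and returns an abstracted response. This is only an illustrative sketch, not the actual Paramiko-based {\dmapper}; in particular, \texttt{concretize}, \texttt{abstract}, the \texttt{sut} object and the \texttt{RESET} symbol are hypothetical stand-ins for the logic described in the following subsections.
\begin{lstlisting}[language=Python]
import socket

# Illustrative sketch only: 'sut', 'concretize' and 'abstract' are
# hypothetical stand-ins for the Paramiko-based logic described below.
def run_mapper(port, sut):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.bind(("localhost", port))
    server.listen(1)
    conn, _ = server.accept()
    channel = conn.makefile("rw")
    for line in channel:
        symbol = line.strip()                    # abstract input from the learner
        if symbol == "RESET":
            sut.reset()                          # fresh SSH session for the next query
            reply = "DONE"
        else:
            packet = concretize(symbol, sut.session)   # build a concrete SSH packet
            responses = sut.send_and_receive(packet)   # list of packets, may be empty
            if not responses:
                reply = "NO_RESP"                # time-out: no output observed
            else:
                reply = "_".join(abstract(p) for p in responses)
        channel.write(reply + "\n")
        channel.flush()
\end{lstlisting}
The \texttt{RESET} handling is an assumption of this sketch: some such mechanism is needed so that every query starts from a fresh SSH session.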
%figure %It is therefore important to focus on messages for which interesting state-changing behaviour can be expected. + \subsection{The learning alphabet}\label{subsec:alphabet} -We split the learning alphabet into three groups, corresponding to the three layers. %Each input in the alphabet has a corresponding message number in the SSH specification. -Learning doesn't scale with a growing alphabet, hence it is important to reduce the alphabet to those inputs that trigger interesting behavior. We do this by -only selecting inputs that are consistent with our learning goal. -Since we are learning only SSH server implementations, we filter out messages that were not intended to be sent to the server. \footnote{Applying this principle to -the RFC's messages results in not including \textsc{service\_accept}, \textsc{ua\_accept}, \textsc{ua\_failure}, \textsc{ua\_banner}, \textsc{ua\_pk\_ok}, \textsc{ua\_pw\_changereq}, \textsc{ch\_success} and \textsc{ch\_failure} in our alphabet.} Furthermore, from the Connection layer we only select general inputs for channel management plus those relating to the terminal functionality. -As mentioned previously, the starting state is already one where SSH versions have been exchanged, thus we also exclude these messages from our alphabet as they are no longer useful -in exploring SSH's behavior once the initial exchange took place. From the inputs defined, we make a selection of essential inputs. The selection forms the \textit{restricted alphabet}, which we use in some experiments. The restricted alphabet significantly decreases the number of queries needed to learn while only marginally limiting explored behavior. We touch on this again in the Section~\ref{sec:result}. Inputs included in the restricted alphabet are marked by '*'. +The alphabet we use consists of inputs, which correspond to messages +sent to the server, and outputs, which correspond to messages received +from the server. We split the input alphabet into three parts, one +for each of the protocol layers. +\marginpar{\tiny Erik: the output alphabet is not discussed anywhere, +but for the discussion of the mapper in the next section it should be} - %messages which don't adhere -%to the binary packet p -%We reduce the alphabet further by only selecting inputs which follow the binary packet protocol, hence we don't include the identification input which should -%be sent by both client and server at the start of every connection. The exchange of these inputs is made implicit. +Learning does not scale with a growing alphabet, and since we are only +learning models of servers, we remove those inputs that are not +intended to ever be sent to the server\footnote{This means we exclude +the messages \textsc{service\_accept}, \textsc{ua\_accept}, +\textsc{ua\_failure}, \textsc{ua\_banner}, \textsc{ua\_pk\_ok}, +\textsc{ua\_pw\_changereq}, \textsc{ch\_success} and +\textsc{ch\_failure} from our alphabet.}. Furthermore, from the +Connection layer we only use messages for channel management and the +terminal functionality. Finally, because we will only explore +protocol behaviour after SSH versions have been exchanged, we exclude +the messages for exchanging version numbers. \marginpar{\tiny Erik: I +rephrased all this to make it simpler. Is it still ok?} +The resulting lists of inputs for the three protocol layers are given +in tables~\ref{trans-alphabet}-\ref{conn-alphabet}. In some +experiments, we used only a subset of the most essential inputs, to +further speed up experiments.
This \textit{restricted alphabet} +significantly decreases the number of queries needed for learning +models while only marginally limiting explored behavior. We discuss +this again in Section~\ref{sec:result}. Inputs included in the +restricted alphabet are marked by '*' in the tables below. Table~\ref{trans-alphabet} introduces the Transport layer inputs. We include a version of the \textsc{kexinit} message with \texttt{first\_kex\_packet\_follows} disabled. This means no guess~\cite[p. 17]{rfc4253} is attempted on the {\dsut}'s parameter preferences. Consequently, the {\dsut} will have to send its own \textsc{kexinit} in order to @@ -109,17 +126,69 @@ The restricted alphabet only supports the most general channel management inputs %table + \subsection{The mapper} -The {\dmapper} must provide translation between abstract message representations and well-formed SSH messages. A special case is for when no output is received from the {\dsut}, in which case the {\dmapper} gives back to the learner a \textsc{no\_resp} message, concluding a timeout occurred. The sheer complexity of the {\dmapper}, meant that it was easier to adapt an existing SSH implementation, rather than construct the {\dmapper} from scratch. Paramiko already provides mechanisms for encryption/decryption, as well as routines for sending the different types of packets, and for receiving them. These routines are called by control logic dictated by Paramiko's own state machine. The {\dmapper} was constructed by replacing this control logic with one dictated by messages received from the {\dlearner}. %over a socket connection -The {\dmapper} maintains a set of state variables which have initial values, and which are updated on specific outputs and used to concretize certain inputs. On receiving a \textsc{kexinit}, the {\dmapper} saves the {\dsut}'s parameter preferences. These preferences define the key exchange, hashing and encryption algorithms supported by the {\dsut}. Before such receipt, these parameters are defaulted to those that any implementation should support, as required by the RFC. Based on these parameters, a key exchange algorithm may be run. The {\dmapper} supports Diffie-Hellman, which it can initiate via a \textsc{kex30} input from the learner. The {\dsut} responds with \textsc{kex31} if the inputs were orderly sent. From \textsc{kex31}, the {\dmapper} saves the hash, as well as the new keys. Receipt of the \textsc{newkeys} response from the {\dsut} will make the {\dmapper} use the new keys earlier negotiated in place of the older ones, if such existed. The {\dmapper} contains two other state variables, -one is a buffer for storing open channels. It is initially empty and is increased/decreased on \textsc{ch\_open} and \textsc{ch\_close} inputs respectively. The other is initially set to 0, and stores the sequence number of the last received message. This is then used when constructing \textsc{unimpl} inputs. +The {\dmapper} must provide a translation between abstract messages +and well-formed SSH messages: it has to translate abstract inputs +listed in tables~\ref{trans-alphabet}-\ref{conn-alphabet} to actual +SSH packets, and translate the SSH packets received in answer +into abstract outputs. + +A special case here occurs when no output is received from the +{\dsut}; in that case the {\dmapper} gives back to the learner a +\textsc{no\_resp} message, to indicate that a time-out occurred.
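As a hedged illustration of the output direction of this translation, the fragment below maps the message number of a received SSH packet (message numbers as assigned in RFC 4250) to an abstract output symbol. The table is deliberately partial and the symbol names are merely chosen to resemble those used in our models; the actual {\dmapper} derives its abstraction from Paramiko's packet handling.
\begin{lstlisting}[language=Python]
# Partial, illustrative mapping from SSH message numbers (RFC 4250)
# to abstract output symbols; the actual mapper covers more cases.
MSG_TO_SYMBOL = {
    1:  "DISCONNECT",       # SSH_MSG_DISCONNECT
    3:  "UNIMPL",           # SSH_MSG_UNIMPLEMENTED
    6:  "SR_ACCEPT",        # SSH_MSG_SERVICE_ACCEPT
    20: "KEXINIT",          # SSH_MSG_KEXINIT
    21: "NEWKEYS",          # SSH_MSG_NEWKEYS
    31: "KEX31",            # SSH_MSG_KEXDH_REPLY
    52: "UA_SUCCESS",       # SSH_MSG_USERAUTH_SUCCESS
    91: "CH_OPEN_SUCCESS",  # SSH_MSG_CHANNEL_OPEN_CONFIRMATION
}

def abstract(packet_type):
    # Unknown message numbers are collapsed into one symbol so that
    # the output alphabet handed to the learner stays small and finite.
    return MSG_TO_SYMBOL.get(packet_type, "UNKNOWN")
\end{lstlisting}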
+ +The sheer complexity of the {\dmapper} meant that it was easier to +adapt an existing SSH implementation, rather than construct the +{\dmapper} from scratch. Paramiko already provides mechanisms for +encryption/decryption, as well as routines for constructing and +sending the different types of packets, and for receiving them. These +routines are called by control logic dictated by Paramiko's own state +machine. The {\dmapper} was constructed by replacing this control +logic with one dictated by messages received from the {\dlearner}. %over a socket connection + +The {\dmapper} maintains a set of state variables to record parameters +of the ongoing session, including for example the server's preferences +for key exchange and encryption algorithm, parameters of these +protocols, and -- once it has been established -- the session key. +These parameters are updated when receiving messages from the server, +and are used to concretize inputs to actual SSH messages to the server. -In certain scenarios, inputs are answered by the {\dmapper} directly instead of being sent to the {\dsut}. These scenarios include the following: +For example, upon receiving a \textsc{kexinit}, the {\dmapper} saves +the {\dsut}'s preferences for key exchange, hashing and encryption +algorithms. Initially these parameters are all set to the defaults +that any server should support, as required by the RFC. The +{\dmapper} supports Diffie-Hellman key exchange, which it will +initiate if it gets a \textsc{kex30} input from the learner. +After this, the {\dsut} responds with a \textsc{kex31} message +(assuming the protocol run so far is correct), and from this +message, the {\dmapper} saves the hash, as well as the new +keys. Receipt of the \textsc{newkeys} response from the {\dsut} will +make the {\dmapper} use the new keys earlier negotiated in place of +the older ones, if such existed. + +The {\dmapper} contains two other +state variables: +one is a buffer for storing open channels. It is initially empty and +is increased/decreased on \textsc{ch\_open} and \textsc{ch\_close} +inputs respectively. The other is initially set to 0, and stores the +sequence number of the last received message. This is then used when +constructing \textsc{unimpl} inputs. \marginpar{\tiny Erik: I don't get this +bit} + +In the following cases, inputs are answered by the {\dmapper} directly +instead of being sent to the {\dsut} to find out its response: \begin{enumerate} -\item if connection with the {\dsut} was terminated, the {\dmapper} responds with a \textsc{no\_conn} message -\item no channel has been opened (the buffer variable is empty) or the maximum number of channels was reached (in our experiments 1), cases which prompt the {\dmapper} +\item if connection with the {\dsut} was terminated, the {\dmapper} + responds with a \textsc{no\_conn} message, as sending further + messages to the {\dsut} is pointless in that case; +\item if no channel has been opened (the buffer variable is empty) or the maximum number of channels was reached (in our experiments 1), cases which prompt the {\dmapper} to respond with \textsc{ch\_none}, or \textsc{ch\_max} respectively + \marginpar{\tiny Erik: i don't get this 2nd bullet; something is + missing?} + \end{enumerate} Overall, we notice that in many ways, the {\dmapper} acts similarly to an SSH client. Hence it is unsurprising that it was built off an existing @@ -127,25 +196,78 @@ implementation.
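The cases in which the {\dmapper} answers an input itself, as enumerated above, can be summarised in a few lines. The sketch below is indicative only; the \texttt{state} object and the exact symbol names are assumptions rather than the actual implementation.
\begin{lstlisting}[language=Python]
# Hypothetical sketch of the inputs the mapper answers itself.
def answer_locally(symbol, state):
    if not state.connected:
        return "NO_CONN"    # connection lost: querying the SUT is pointless
    if symbol == "CH_OPEN" and len(state.channels) >= 1:
        return "CH_MAX"     # channel limit (1 in our experiments) reached
    if symbol.startswith("CH_") and symbol != "CH_OPEN" and not state.channels:
        return "CH_NONE"    # no open channel to operate on
    return None             # otherwise: concretize and send to the SUT
\end{lstlisting}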
\subsection{Compacting SSH into a small Mealy machine} -The {\dmapper} not only provides abstraction, it also ensures that the abstract representation shown to the learner behaves like a deterministic Mealy Machine. -This is needed, as SSH implementations are inherently \textit{non-determistic}. Sources of non-determinism can be divided into three categories: + +There are three practical complications in learning models of SSH +servers: (1) an SSH server may exhibit \emph{non-deterministic} behaviour; (2) +a single input to the server can produce a \emph{sequence} of outputs +rather than just a single output, and (3) \emph{buffering} behaviour of the +server. These complications are discussed below. + +SSH implementations can exhibit non-deterministic behaviour. The +learning algorithm cannot deal with non-determinism, so the {\dmapper} +not only provides abstraction, but it also ensures\marginpar{\tiny +Erik:'checks' instead of 'ensures'??} that the abstract representation +shown to the learner behaves like a deterministic Mealy machine. +There are a few sources of non-determinism: \begin{enumerate} -\item SSH's \textit{protocol design} is inherently non-deterministic. Firstly, because underspecification leads to multiple options for developers, from which one can be selected in a non-deterministic manner. Secondly, because non-deterministic behaviour directly results from the specifications. An example of the latter is allowing to insert \textsc{debug} and \textsc{ignore} messages at any given time. -\item \textit{Response timing} is a source of non-determinism as well. For example, the {\dmapper} might conclude a timeout before the {\dsut} had sent its response. We had to set -timeout values accordingly, so that enough time was allowed for the response to be received. -\item Other \textit{timing-related quirks} can cause non-deterministic behaviour as well. Some {\dsuts} behave unexpectedly when a new query is received shortly after the previous one. +\item Underspecification in the SSH specification (for example, by not + specifying the order of certain messages) allows + non-deterministic behaviour of implementations. Even if client + and server do implement a fixed order for messages they send, the + asynchronous nature of communication between them means the + interleaving of messages sent and received may vary. Moreover, + client and server are free to intersperse \textsc{debug} and + \textsc{ignore} messages at any given time; the \textsc{ignore} + messages are intended to thwart traffic analysis. +\item Timing is another source of non-deterministic behaviour. For + example, the {\dmapper} might time-out before the {\dsut} had + sent its response. + + In our experiments we tried to set time-out periods so that this + did not occur. However, other timing-related quirks can still + cause non-determinism. For example, {\dsuts} behave + unexpectedly when a new query is received too soon after the + previous one. %For example, a trace in which a valid user authentication is performed within five milliseconds after an authentication request on DropBear can cause the authentication to (wrongly) fail. \end{enumerate} +% +To detect non-determinism, we cache all new observations in an SQLite database and verify observations against this cache. +\marginpar{\tiny Erik: the text does not explain what we do if non-determinism +is detected. Do we ignore the new trace and report the old one from the +database?
And/or warn that non-determinism is happening?} +The cache also enables us to answer queries answered before +without running inputs on the {\dsut}. The cache also carried over +between consecutive learning runs: we could reload the cache from a previous run, +enabling quick responses to all queries up to the point where the previous +run ended.\marginpar{\tiny Erik: but the trace still has to be +re-run, right, if we want to extend it beyond the end of the previous +run?} -To detect non-determinism, we cache all new observations in an SQLite database and verify observations against this cache. The cache also enables us to answer to queries answered before -without running inputs on the {\dsut}. This transcended consecutive learning runs, as we could reload the cache from a previous run, enabling quick response of all queries up to the point the previous run ended. %A subsequent identical learning run can quickly resume from where the previous one was ended, as the cache from the previous run is used to quickly respond to all queries up to the point the previous run ended. -Aside from non-determinism, SSH implementations can also produce a sequence of outputs in response to an input, whereas Mealy machines allow for only one output. To that end, the {\dmapper} -concatenates all outputs into one, delivering a single output to the {\dlearner}. +An SSH server may also produce a sequence of outputs in response to an +input. This means it is not behaving as a Mealy machine, which allows +for only one output. Dealing with this is simple: the {\dmapper} +concatenates all outputs into one, and it produces this sequence +as the single output to the {\dlearner}. -Another challenge is presented by buffering, leading to an explosion of states, as buffers cannot be described succintly by Mealy Machines. -We have encountered buffers in two occasions. Firstly, some implementations buffer certain responses when in key re-exchange. As soon as rekeying completes, these queued messages are released all at once. This leads to a \textsc{newkeys} response (indicating rekeying has completed), directly followed by all buffered responses. This, combined with concatenation of the multiple output responses would lead to non-termination of the learning algorithm, as for every variant of the response queue there would be a different output. To counter this, we replace the concatenation of queued output responses by the single string \textsc{buffered}, thus forming \textsc{newkeys\_buffered}. +Another challenge is presented by buffering, which leads to an +explosion of states in learning, as buffers cannot be described +succinctly by Mealy machines. +We have encountered buffering in two situations. Firstly, some +implementations buffer responses during a key re-exchange; only once +the rekeying is complete, are all these queued responses released, all +at once. This leads to a \textsc{newkeys} response (indicating +rekeying has completed), directly followed by all buffered responses. +This would lead to non-termination of the learning algorithm, as for +every sequence of buffered messages there is a different response. To +prevent this, we treat the sequence of queued responses +as a single output \textsc{buffered}. -Buffer behaviour can also be observed when opening and closing channels, since a SUT can close only as many channels as have previously been opened. For this reason, we restrict the number of simultaneously open channels to one.
The {\dmapper} returns a custom response \textsc{ch\_max} to a \textsc{ch\_open} message whenever this limit is reached. \ No newline at end of file +Secondly, buffering happens when opening and closing channels, +since a SUT can close only as many channels as have previously been +opened. For this reason, we restrict the number of simultaneously open +channels to one. The {\dmapper} returns a custom response +\textsc{ch\_max} to a \textsc{ch\_open} message whenever this limit is +reached. diff --git a/preliminaries.tex b/preliminaries.tex index c9ded72..36c5bb4 100644 --- a/preliminaries.tex +++ b/preliminaries.tex @@ -26,10 +26,13 @@ input enabled and deterministic. %\end{figure} \subsection{MAT Framework} \label{ssec:mat} + The most efficient algorithms for model learning all follow the pattern of a \emph{minimally adequate teacher (MAT)} as proposed by Angluin~\cite{Angluin1987Learning}. -In the MAT framework, learning is viewed as a game in which a \emph{learner} has to infer an unknown automaton by asking queries to a teacher. The teacher knows the automaton, which in our setting is a deterministic Mealy machine $\M$. -Initially, the {\dlearner} only knows the inputs $I$ and outputs $O$ of $\M$. +Here learning is viewed as a game in which a \emph{learner} has to infer an unknown automaton by asking queries to a teacher. The teacher knows the automaton, which in our setting is a deterministic Mealy machine $\M$, +also called the System Under Learning ({\dsut}). +Initially, the {\dlearner} only knows the input alphabet $I$ and +output alphabet $O$ of $\M$. The task of the {\dlearner} is to learn $\M$ through two types of queries: \begin{itemize} \item @@ -41,9 +44,15 @@ whether $\CH \approx \M$. The teacher answers \emph{yes} if this is the case. Ot \emph{no} and supplies a \emph{counterexample}, which is a sequence $\sigma \in I^{\ast}$ that triggers a different output sequence for both Mealy machines, that is, $A_{\CH}(\sigma) \neq A_{\M}(\sigma)$. \end{itemize} +% +If $\M$ is treated as a black box, the equivalence query can only be +approximated by a \emph{test query}, which uses conformance testing +\cite{LeeY96} -- more specifically, model-based testing -- +to look for a counterexample with a finite number of queries. Note +that this cannot exclude the possibility that there is more behaviour +that has not been discovered. For a recent overview of model learning +algorithms for this setting see \cite{Isberner2015}. -Model learning algorithms have been developed developed for learning deterministic Mealy machines using -a finite number of queries. We point to \cite{Isberner2015} for a recent overview. These algorithms are leveraged in applications where one wants to learn a model of a black-box reactive system, or System Under Learning ({\dsut}). The teacher typically consists of the {\dsut}, which answers membership queries, and a conformance testing tool \cite{LeeY96} that approximates equivalence queries using a set @@ -101,4 +110,4 @@ concrete inputs to abstract inputs and concrete outputs to abstract outputs. For %\item The transport layer protocol. This creates the basis for communication between server and client, providing a key exchange protocol and server authentication. The key exchange protocol is performed through three roundtrips. During the first, both client and server send a KEXINIT message. Then, the client sends a KEX30 message, the server responds with a KEX31 message. 
Finally, both parties send a NEWKEYS message, which indicates that the keys sent in the second step can be used. %\item The user authentication protocol. This component is used to authenticate a client to the server, for example, through a username and password combination, or through SSH-keys. %\item The connection protocol. This is used to provide different services to the connected client, it can thus multiplex the encrypted channel into different channels. The provided services can be services like file transfer or a remote terminal. Typical messages are requests for opening or closing channels, or requests for earlier named services. -%\end{itemize} \ No newline at end of file +%\end{itemize} diff --git a/security_definitions.tex b/security_definitions.tex index d8f05d9..c4edf77 100644 --- a/security_definitions.tex +++ b/security_definitions.tex @@ -29,8 +29,8 @@ This function updates the output and state variables for a given valuation of th %} %\lstset{showspaces=true} -\begin{figure}[h] +\begin{figure}[h] \centering %\begin{subfigure} \begin{tikzpicture}[>=stealth',shorten >=1pt,auto,node distance=2.8cm] @@ -70,6 +70,7 @@ This function updates the output and state variables for a given valuation of th \label{fig:nusmvex} \end{figure} + The remainder of this section defines the properties we formalized and verified. We group these properties into four categories: \begin{enumerate} @@ -83,7 +84,7 @@ A key note is that properties are checked not on the actual concrete model of th \subsection{Basic characterizing properties} %cannot be translated. %Though in practical terms, these results are still translatable, in particular for cases where properties are not met. -In our setting, once the connection is lost, it can no longer be recovered and the {\dmapper} will itself respond with \textsc{no\_conn} to any subsequent non-Connection layer inputs sent by the {\dlearner}. This behavior is described by Property~\ref{prop:noconn}, where \emph{isConnectionInput} is a predicate which only holds if the input supplied is a Connection layer input. The reason we exclude connection inputs is due to a mapper characteristic we touched on in Section~\ref{sec:result}. The {\dmapper} maintains a buffer of opened channels and limits its size to 1. From the perspective of the {\dmapper}, a channel is open, and thus added to the buffer, whenever \textsc{ch\_open} is received from the learner, regardless if a channel was actually opened on the {\dsut}. If an attempt to open an additional channel is made, the {\dmapper} itself responds by \textsc{ch\_max} without querying the {\dsut}. Conversely, if there is no channel open (the buffer is empty) and an input operating on a channel is sent, the {\dmapper} responds by \textsc{ch\_none}, again, without querying the {\dsut}. Additionally, a channel opened on the {\dmapper} is closed and removed from the buffer on a {ch\_close} from the {\dlearner}, with a corresponding SSH CHANNEL CLOSE message being sent to the {\dsut}. We use Property~\label{prop:channel} to describe this mapper induced behavior. +In our setting, once the connection is lost, it can no longer be recovered and the {\dmapper} will itself respond with \textsc{no\_conn} to any subsequent non-Connection layer inputs sent by the {\dlearner}. This behavior is described by Property~\ref{prop:noconn}, where \emph{isConnectionInput} is a predicate which only holds if the input supplied is a Connection layer input. 
The reason we exclude connection inputs is due to a mapper characteristic we touched on in Section~\ref{sec:result}. The {\dmapper} maintains a buffer of opened channels and limits its size to 1. From the perspective of the {\dmapper}, a channel is open, and thus added to the buffer, whenever \textsc{ch\_open} is received from the learner, regardless if a channel was actually opened on the {\dsut}. If an attempt to open an additional channel is made, the {\dmapper} itself responds by \textsc{ch\_max} without querying the {\dsut}. Conversely, if there is no channel open (the buffer is empty) and an input operating on a channel is sent, the {\dmapper} responds by \textsc{ch\_none}, again, without querying the {\dsut}. Additionally, a channel opened on the {\dmapper} is closed and removed from the buffer on a {ch\_close} from the {\dlearner}, with a corresponding SSH CHANNEL CLOSE message being sent to the {\dsut}. We use Property~\ref{prop:channel} to describe this mapper induced behavior. \begin{property}%[h] diff --git a/sigproc.bib b/sigproc.bib index a0249eb..e112ebb 100644 --- a/sigproc.bib +++ b/sigproc.bib @@ -29,10 +29,10 @@ @inproceedings{SMJV15, author = {W. Smeenk and J. Moerman and D.N. Jansen and F.W. Vaandrager}, booktitle = {Proceedings 17th International Conference on Formal Engineering Methods (ICFEM 2015), Paris, 3-6 November 2015}, - series = {Lecture Notes in Computer Science}, + series = {LNCS}, volume = 9407, pages = {1--17}, - publisher = {Springer-Verlag}, + publisher = {Springer}, year = 2015, editor = {M. Butler and S. Conchon and F. Zaidi}, title = {Applying Automata Learning to Embedded Control Software} @@ -40,22 +40,12 @@ @MastersThesis{Boss2012, title={Evaluating implementations of SSH by means of model-based testing}, - author={Lenaerts, T}, + author={Lenaerts, T.}, year={2012}, - document_type = {Bachelor's Thesis}, +document_type = {Bachelor's Thesis}, type = {Bachelor's Thesis}, howpublished = {Online \url{https://pdfs.semanticscholar.org/8841/47071555f50b614c6af640cea5152fee10f2.pdf}}, - journal = {Radboud University} -} - -@article{Poll_rigorous_2011, - title = {Rigorous specifications of the {SSH} {Transport} {Layer}}, - url = {http://www.cs.kun.nl/~erikpoll/publications/ssh.pdf}, - urldate = {2017-02-13}, - journal = {Radboud University Nijmegen, Tech. Rep. 
ICIS–R11004}, - author = {Poll, Erik and Schubert, Aleksy}, - year = {2011}, - keywords = {ssh}, + school = {Radboud University} } @article{Udrea_rule-based_2008, @@ -83,7 +73,7 @@ document_type = {Bachelor's Thesis}, type = {Bachelor's Thesis}, howpublished = {Online}, - journal = {Radboud University} + school = {Radboud University} } @@ -96,10 +86,10 @@ @MastersThesis{Verleg2016, title={Inferring SSH state machines using protocol state fuzzing}, - author={Verleg, P}, + author={Verleg, P.}, year={2016}, howpublished = {Online}, - journal = {Radboud University} + school = {Radboud University} } @article{rfc760, @@ -146,8 +136,8 @@ pages = {188--204}, posted-at = {2015-12-07 11:15:39}, priority = {2}, - publisher = {Springer Berlin Heidelberg}, - series = {Lecture Notes in Computer Science}, + publisher = {Springer}, + series = {LNCS}, title = {Generating Models of {Infinite-State} Communication Protocols Using Regular Inference with Abstraction}, url = {http://dx.doi.org/10.1007/978-3-642-16573-3_14}, volume = {6435}, @@ -195,7 +185,7 @@ machine learning algorithms}, issn= {0925-9856}, doi= {10.1007/s10703-014-0216-x}, url= {http://dx.doi.org/10.1007/s10703-014-0216-x}, - publisher= {Springer US}, + publisher= {Springer}, keywords= {Active automata learning; Mealy machines; Abstraction techniques; Communication protocols; Session initiation protocol; Transmission control protocol}, volume = 46, number = 1, @@ -233,7 +223,7 @@ machine learning algorithms}, booktitle={International Colloquium on Theoretical Aspects of Computing}, pages={165--183}, year={2015}, - organization={Springer International Publishing} + organization={Springer} } @inproceedings{ralib2015, @@ -248,10 +238,11 @@ machine learning algorithms}, file = {2015-RALib A LearnLib extension for inferring EFSMs.pdf:C\:\\Users\\Paul\\AppData\\Roaming\\Zotero\\Zotero\\Profiles\\tt1zn5x1.default\\zotero\\storage\\TQUR4TS7\\2015-RALib A LearnLib extension for inferring EFSMs.pdf:application/pdf} } -@article{Chalupar2014Automated, +@inproceedings{Chalupar2014Automated, author = {Chalupar, Georg and Peherstorfer, Stefan and Poll, Erik and de Ruiter, Joeri}, citeulike-article-id = {13837720}, - journal = {WOOT'14 Proceedings of the 8th USENIX conference on Offensive Technologies}, + booktitle = {Proceedings of the 8th USENIX workshop on + Offensive Technologies (WOOT'14)}, pages = {1--10}, posted-at = {2015-11-13 14:58:54}, priority = {2}, @@ -270,8 +261,8 @@ machine learning algorithms}, pages = {673--686}, posted-at = {2015-11-13 14:56:16}, priority = {2}, - publisher = {Springer Berlin Heidelberg}, - series = {Lecture Notes in Computer Science}, + publisher = {Springer}, + series = {LNCS}, title = {Inference and Abstraction of the Biometric Passport}, url = {http://dx.doi.org/10.1007/978-3-642-16558-0_54}, volume = {6415}, @@ -370,8 +361,8 @@ machine learning algorithms}, pages = {345--361}, posted-at = {2015-11-13 12:20:10}, priority = {2}, - publisher = {Springer Berlin Heidelberg}, - series = {Lecture Notes in Computer Science}, + publisher = {Springer}, + series = {LNCS}, title = {{Plaintext-Dependent} Decryption: A Formal Security Treatment of {SSH}-{CTR}}, url = {http://dx.doi.org/10.1007/978-3-642-13190-5_18}, volume = {6110}, @@ -389,8 +380,8 @@ machine learning algorithms}, pages = {356--374}, posted-at = {2015-11-13 12:16:22}, priority = {2}, - publisher = {Springer Berlin Heidelberg}, - series = {Lecture Notes in Computer Science}, + publisher = {Springer}, + series = {LNCS}, title = {Analysis of the {SSH} Key Exchange 
Protocol}, url = {http://dx.doi.org/10.1007/978-3-642-25516-8_22}, volume = {7089}, @@ -454,7 +445,6 @@ machine learning algorithms}, citeulike-linkout-0 = {http://dx.doi.org/10.1109/icstw.2013.60}, citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6571671}, doi = {10.1109/icstw.2013.60}, - institution = {Inst. for Comput. \& Inf. Sci., Radboud Univ. Nijmegen, Nijmegen, Netherlands}, isbn = {978-1-4799-1324-4}, month = mar, pages = {461--468}, @@ -475,7 +465,7 @@ machine learning algorithms}, title="Combining Model Learning and Model Checking to Analyze TCP Implementations", bookTitle="Computer Aided Verification: 28th International Conference, CAV 2016, Toronto, ON, Canada, July 17-23, 2016, Proceedings, Part II", year="2016", - publisher="Springer International Publishing", + publisher="Springer", address="Cham", pages="454--471", isbn="978-3-319-41540-6", @@ -507,7 +497,6 @@ machine learning algorithms}, priority = {2}, publisher = {USENIX Association}, title = {{Not-Quite}-{So-Broken} {TLS}: Lessons in {Re-Engineering} a Security Protocol Specification and Implementation}, - url = {https://www.usenix.org/conference/usenixsecurity15/technical-sessions/presentation/kaloper-mersinjak}, year = {2015} } @@ -523,12 +512,10 @@ machine learning algorithms}, priority = {2}, publisher = {USENIX Association}, title = {Protocol State Fuzzing of {TLS} Implementations}, - url = {https://www.usenix.org/conference/usenixsecurity15/technical-sessions/presentation/de-ruiter}, year = {2015} } -@unpublished{Poll2011Rigorous, - abstract = {Abstract. This document presents (semi-)formal specifications of the security protocol {SSH}, more specifically the transport layer protocol, and describe a source code review of {OpenSSH}, the leading implementation of {SSH}, using these specifications. Our specifications, in the form of finite state machines, are at a different level of abstraction that the typical formal descriptions used to study security protocols. Our motivation is to understand actual implementations of {SSH}, so we try to capture some of the details from the official (informal) specification that are irrelevant to the security of the abstract protocol, but which do complicate the implementation. Our specifications should be useful to anyone trying to understand or implement {SSH}. First versions of our specifications were developed for the formal verification of a Java implementation of {SSH} [17]. 
1}, +@techreport{Poll_rigorous_2011, author = {Poll, Erik and Schubert, Aleksy}, citeulike-article-id = {13778664}, citeulike-linkout-0 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.194.1815}, @@ -536,7 +523,8 @@ machine learning algorithms}, priority = {2}, title = {Rigorous specifications of the {SSH} Transport Layer}, url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.194.1815}, - year = {Radboud University, 2011} + year = {2011}, + institution = {Radboud University} } @article{Pasareanu2008Learning, @@ -558,7 +546,7 @@ machine learning algorithms}, pages = {175--205}, posted-at = {2015-09-30 07:58:08}, priority = {2}, - publisher = {Springer US}, + publisher = {Springer}, title = {Learning to divide and conquer: applying the {L}* algorithm to automate assume-guarantee reasoning}, url = {http://dx.doi.org/10.1007/s10703-008-0049-6}, volume = {32}, @@ -576,8 +564,8 @@ machine learning algorithms}, pages = {1--18}, posted-at = {2015-09-30 07:56:06}, priority = {2}, - publisher = {Springer Berlin Heidelberg}, - series = {Lecture Notes in Computer Science}, + publisher = {Springer}, + series = {LNCS}, title = {Inferring Protocol State Machine from Network Traces: A Probabilistic Approach}, url = {http://dx.doi.org/10.1007/978-3-642-21554-4_1}, volume = {6715}, @@ -639,10 +627,18 @@ machine learning algorithms}, pages = {75--90}, posted-at = {2015-09-30 07:41:58}, priority = {2}, - publisher = {Springer US}, + publisher = {Springer}, series = {IFIP — The International Federation for Information Processing}, title = {Automatic executable test case generation for extended finite state machine protocols}, url = {http://dx.doi.org/10.1007/978-0-387-35198-8_6}, year = {1997} } +@inproceedings{PollSchubert07, + author = {E. Poll and A. Schubert}, + title = {Verifying an implementation of {SSH}}, + booktitle = {WITS'07}, + year = {2007}, + pages = {164-177}, +} + diff --git a/the_sut.tex b/the_sut.tex index 6b6bbdf..8301574 100644 --- a/the_sut.tex +++ b/the_sut.tex @@ -2,19 +2,17 @@ The Secure Shell Protocol (or SSH) is a protocol used for secure remote login and other secure network services over an insecure network. It is an application layer protocol running on top of TCP, which provides reliable data transfer, but does not provide any form of connection security. The initial version of SSH was superseded by a second version (SSHv2), as the former was found to contain design flaws~\cite{FutoranskyAttack} which could not be fixed without losing backwards compatibility. This work focuses on SSHv2. -SSHv2 follows a client-server paradigm consisting of three components (Figure~\ref{fig:sshcomponents}): -\begin{itemize} +SSHv2 follows a client-server paradigm. The protocol consists of three layers (Figure~\ref{fig:sshcomponents}): +\begin{enumerate} \item The \textit{transport layer protocol} (RFC 4253~\cite{rfc4253}) forms the basis for any communication between a client and a server. It provides confidentiality, integrity and server authentication as well as optional compression. \item The \textit{user authentication protocol} (RFC 4252~\cite{rfc4252}) is used to authenticate the client to the server. \item The \textit{connection protocol} (RFC 4254~\cite{rfc4254}) allows the encrypted channel to be multiplexed in different channels. These channels enable a user to run multiple processes, such as terminal emulation or file transfer, over a single SSH connection. -\end{itemize} +\end{enumerate} -Each layer has its own specific messages. 
The SSH protocol is interesting in that outer layers do not encapsulate inner layers. This means that different layers can interact. Hence, it makes sense to analyze SSH as a whole, instead of analyzing its constituent layers independently. We review each layer, outlining the relevant messages which are later used in learning. We refer to as \textit{happy flow}, -the common case of a successful operation. +Each layer has its own specific messages. The SSH protocol is interesting in that outer layers do not encapsulate inner layers. This means that different layers can interact. Hence, it makes sense to analyze SSH as a whole, instead of analyzing its constituent layers independently. We review each layer, outlining the relevant messages which are later used in learning, and characterizing the so-called \textit{happy flow} that a normal protocol +run follows. -The SSH protocol happy flow at a high level consists of the following steps: (1) the client first establishes a TCP connection with the server, (2) the two sides negotiate encryption algorithms, and subsequently establish one-time session keys via a negotiated key exchange method, further communication is secured using these keys, (3) the client authenticates to the server by providing valid credentials and finally, (4) the client accesses services on the server like for example the terminal service. Leaving out the first step, each step that follows is an integral part of each SSH layer. - -%perhaps room for figure +At a high level, a typical SSH protocol run uses the three constituent protocols in the order given above: after the client establishes a TCP connection with the server, (1) the two sides use the transport layer protocol to negotiate key exchange and encryption algorithms, and use these to establish session keys which are then used to secure further communication; (2) the client uses the user authentication protocol to authenticate to the server; (3) the client uses the connection protocol to access services on the server, for example the terminal service. %Different layers are identified by their message numbers. These message numbers will form the basis of the state fuzzing. The SSH protocol is especially interesting because outer layers do not encapsulate inner layers. This means that different layers can interact. One could argue that this is a less systematic approach, in which a programmer is more likely to make state machine-related errors. @@ -26,7 +24,7 @@ The SSH protocol happy flow at a high level consists of the following steps: (1) \end{figure} \subsection{Transport layer}\label{ssh-run-trans} -SSH runs over TCP, and provides end-to-end encryption based on pseudo-random keys. Once a TCP connection has been established with the server, these keys are securely negotiated as part of a \textsl{key exchange} method, the first step of the protocol. Key exchange begins by the two sides exchanging their preferences for negotiable parameters relating to the actual key exchange algorithm used, as well as encryption, compression and hashing algorithms preferred and supported by either side. Preferences are sent via the \textsc{kexinit} message. Subsequently, key exchange using the negotiated algorithm takes place. Following this algorithm, one-time session keys for encryption and hashing are generated by each side, together with an identifier for the session. Diffie-Hellman is the main key exchange algorithm, and the only one required for support by the RFC.
Under the Diffie-Hellman scheme, \textsc{kex30} and \textsc{kex31} are exchanged and new session keys are produced. These keys are used from the moment the \textsc{newkeys} command has been issued by both parties. A subsequent \textsc{sr\_auth} requests the authentication service. The happy flow thus consists of the succession of the three steps comprising key exchange, followed up by a successful authentication service request. The sequence is shown in Figure~\ref{fig:hf-trans}. +SSH runs over TCP, and provides end-to-end confidentiality and integrity using pseudo-random session keys. Once a TCP connection has been established with the server, these session keys are securely negotiated as part of a \textsl{key exchange} method, the first step of the protocol. Key exchange begins with the two sides exchanging their preferences for the key exchange algorithm used, as well as encryption, compression and hashing algorithms. Preferences are sent with a \textsc{kexinit} message. Subsequently, key exchange using the negotiated algorithm takes place. Following this algorithm, one-time session keys for encryption and hashing are generated by each side, together with an identifier for the session. Diffie-Hellman is the main key exchange algorithm, and the only one required for support by the RFC. Under the Diffie-Hellman scheme, \textsc{kex30} and \textsc{kex31} are exchanged and new session keys are produced. These keys are used from the moment the \textsc{newkeys} command has been issued by both parties. A subsequent \textsc{sr\_auth} requests the authentication service. The happy flow thus consists of the succession of the three steps comprising key exchange, followed up by a successful authentication service request. The sequence is shown in Figure~\ref{fig:hf-trans}. \begin{figure}[!hb] \includegraphics[scale=0.285]{hf-trans.pdf} @@ -34,13 +32,15 @@ SSH runs over TCP, and provides end-to-end encryption based on pseudo-random key \label{fig:hf-trans} \end{figure} -\textsl{Key re-exchange}~\cite[p. 23]{rfc4253}, or \textsl{rekey}, is a near identical process, with the difference being that instead of taking place at the beginning, it takes place once session keys are already in place. The purpose is to renew session keys so as to foil potential replay attacks~\cite[p. 17]{rfc4251}. It follows the same steps as key exchange. Messages exchanged as part of it are encrypted using the old set of keys, messages exchanged afterward are encrypted using the new keys. A key property of rekey is that it should preserve the state. That is, the state of the protocol before a rekey should be the same as afterward, with the difference that new keys are now in use. %Some implementations are known not support rekey in certain states of the protocol. +\textsl{Key re-exchange}~\cite[p. 23]{rfc4253}, or \textsl{rekeying}, is a near identical process, with the difference being that instead of taking place at the beginning, it takes place once session keys are already in place. The purpose is to renew session keys so as to foil potential replay attacks~\cite[p. 17]{rfc4251}. It follows the same steps as key exchange. Messages exchanged as part of it are encrypted using the old set of keys, messages exchanged afterward are encrypted using the new keys. A fundamental property of rekeying is that it should preserve the state; that is, after the rekeying procedure is completed, the protocol should be in the same state as +it was before the rekeying started, with the only difference that new keys are now in use.
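To make the transport-layer happy flow and rekeying concrete, here is a minimal client-side sketch using the Python SSH library Paramiko. This is only an illustration: the host name and port are hypothetical placeholders, Paramiko is not necessarily the implementation referred to elsewhere in this work, and its high-level calls bundle several of the abstract messages above (for instance, a single call covers the \textsc{kexinit}, \textsc{kex30}/\textsc{kex31} and \textsc{newkeys} exchange).

    # Minimal sketch of the transport-layer happy flow with Paramiko (assumed library).
    # "example.com" and port 22 are hypothetical placeholders.
    import socket
    import paramiko

    sock = socket.create_connection(("example.com", 22))
    transport = paramiko.Transport(sock)

    # start_client() performs the version banner exchange, the KEXINIT preference
    # exchange, the Diffie-Hellman key exchange and the switch to the freshly
    # derived session keys (NEWKEYS) before returning.
    transport.start_client()
    print(transport.get_security_options().kex)   # kex algorithms the client offers in KEXINIT

    # Rekeying: the same key-exchange steps are run again over the already
    # encrypted connection; the protocol state is preserved, only the keys change.
    transport.renegotiate_keys()

    # The authentication service request (sr_auth) is sent by Paramiko as soon as
    # an authentication call is made (see the second sketch below).
    transport.close()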
%Some implementations are known not to support rekeying in certain states of the protocol. %We consider an transport layer state machine secure if there is no path from the initial state to the point where the authentication service is invoked without exchanging and employing cryptographic keys. \subsection{Authentication layer}\label{ssh-run-auth} -Once the authentication service has been summoned, authentication can commence. To this end, RFC 4252~\cite{rfc4252} defines four authentication methods (password, public-key, host-based and none). The authentication request includes a user name, service name and authentication data, which consists of both the authentication method as well as the data needed to perform the actual authentication, such the password or public key. The happy flow for this layer, as shown in Figure~\ref{fig:hf-auth}, is defined as the sequence that results in a successful authentication. The messages \textsc{ua\_pw\_ok} and \textsc{ua\_pk\_ok} achieve this for respectively password or public key authentication. Figure~\ref{fig:hf-auth} presents the case involving password authentication. + +Once a secure tunnel has been established, the client can authenticate. For this, RFC 4252~\cite{rfc4252} defines four authentication methods (password, public-key, host-based and none). The authentication request includes a user name, service name and authentication data, which consists of both the authentication method as well as the data needed to perform the actual authentication, such as the password or public key. The happy flow for this layer, as shown in Figure~\ref{fig:hf-auth}, is simply a single protocol step that results in a successful authentication. The messages \textsc{ua\_pw\_ok} and \textsc{ua\_pk\_ok} achieve this for password and public key authentication, respectively. Figure~\ref{fig:hf-auth} presents the case for password authentication. %We consider a user authentication layer state machine secure if there is no path from the unauthenticated state to the authenticated state without providing correct credentials. \begin{figure}[!ht] @@ -52,7 +52,9 @@ Once the authentication service has been summoned, authentication can commence. \subsection{Connection layer}\label{ssh-run-conn} -Successful authentication makes services of the Connection layer available. The Connection layer enables the user to open and close channels of different types, with each type providing access to specific services. From the various services available, we focus on the remote terminal over a session channel, a standout feature of SSH. The happy flow involves opening a session channel, \textsc{ch\_open}, requesting a ``pseudo terminal'' \textsc{ch\_request\_pty}, sending and managing data via the messages \textsc{ch\_send\_data}, \textsc{ch\_window\_adjust}, \textsc{ch\_send\_eof}, and eventually closing the channel via \textsc{ch\_close}. Figure~\ref{fig:hf-conn} depicts the common scenario. +Successful authentication makes services of the Connection layer available. The Connection layer enables the user to open and close channels of various types, with each type providing access to specific services. Of the various services available, we focus on the remote terminal over a session channel, a classic use of SSH.
The happy flow consists of opening a session channel, \textsc{ch\_open}, requesting a ``pseudo terminal'' \textsc{ch\_request\_pty}, sending and managing data via the messages \textsc{ch\_send\_data}, \textsc{ch\_window\_adjust}, \textsc{ch\_send\_eof}, and eventually closing the channel via \textsc{ch\_close}, as depicted in Figure~\ref{fig:hf-conn}. +\marginpar{\tiny Erik: to match this text, the figure should include a cycle +for \textsc{ch\_send\_data}, \textsc{ch\_window\_adjust}, \textsc{ch\_send\_eof}??} %Because the connection protocol offers a wide range of functionalities, we it is hard to define a single happy flow. Requesting a terminal is one of the main features of SSH and has therefore been selected as the happy flow. This behaviour is typically triggered by the trace \textsc{ch\_open}; \textsc{ch\_request\_pty}. Other @@ -63,4 +65,4 @@ Successful authentication makes services of the Connection layer available. The \includegraphics[scale=0.35]{hf-conn.pdf} \caption{The happy flow for the connection layer.} \label{fig:hf-conn} -\end{figure} \ No newline at end of file +\end{figure} -- GitLab
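For the authentication and connection layers described above, the following sketch continues the same illustrative Paramiko-based setup. The user name, password and the command sent over the terminal are hypothetical placeholders, and the comments only map each call loosely onto the abstract messages used in the text.

    # Sketch of the authentication and connection layer happy flows with Paramiko
    # (assumed library); host, credentials and command are hypothetical placeholders.
    import socket
    import paramiko

    transport = paramiko.Transport(socket.create_connection(("example.com", 22)))
    transport.start_client()                  # transport-layer happy flow (key exchange)

    # Authentication layer: a single successful password authentication step.
    transport.auth_password("demo-user", "demo-password")

    # Connection layer: open a session channel, request a pseudo-terminal,
    # exchange some data, then signal EOF and close the channel.
    channel = transport.open_session()        # ch_open
    channel.get_pty()                         # ch_request_pty
    channel.invoke_shell()                    # start the remote terminal service
    channel.send(b"echo hello\n")             # ch_send_data
    print(channel.recv(1024))                 # read the terminal's reply
    channel.shutdown_write()                  # ch_send_eof
    channel.close()                           # ch_close
    transport.close()
    # Window adjustments (ch_window_adjust) are handled internally by the library.

A regular client library enforces this ordering of calls; the sketch follows the happy-flow order only to make the message sequence concrete.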