Commit 1e3db523 authored by Alexander Henkel

feedback impl

parent 660c9d2a
......@@ -880,4 +880,31 @@ DOI = {10.3390/s16010115}
publisher = {Springer},
address = {Berlin, Germany},
doi = {10.1007/978-3-540-85563-7_14}
}
@article{Xiong2016Nov,
author = {Xiong, Peng and Wang, Hongrui and Liu, Ming and Lin, Feng and Hou, Zengguang and Liu, Xiuling},
title = {{A stacked contractive denoising auto-encoder for ECG signal denoising}},
journal = {Physiol. Meas.},
volume = {37},
number = {12},
pages = {2214--2230},
year = {2016},
month = nov,
issn = {0967-3334},
publisher = {IOP Publishing},
doi = {10.1088/0967-3334/37/12/2214}
}
@article{Xiong2016Jun,
author = {Xiong, Peng and Wang, Hongrui and Liu, Ming and Zhou, Suiping and Hou, Zengguang and Liu, Xiuling},
title = {{ECG signal enhancement based on improved denoising auto-encoder}},
journal = {Eng. Appl. Artif. Intell.},
volume = {52},
pages = {194--202},
year = {2016},
month = jun,
issn = {0952-1976},
publisher = {Pergamon},
doi = {10.1016/j.engappai.2016.02.015}
}
\ No newline at end of file
\chapter*{Abstract}
Wearable sensors like smartwatches offer a good opportunity for human activity recognition (HAR). They are available to a wide user base and can be used in everyday life. Due to the variety of users, the detection model must be able to recognize different movement patterns. Recent research has demonstrated that personalized recognition tends to perform better than a general one. However, additional labeled data from the user is required, which can be time-consuming and labor-intensive. While common personalization approaches try to reduce the necessary labeled training data, the labeling process remains dependent on some user interaction.
In this work, I present a personalization approach in which training data labels are derived from inexplicit user feedback obtained during the usual use of a HAR application. The general model predicts labels which are then refined by various denoising filters based on Convolutional Neural Networks and Autoencoders. This process is assisted by the previously obtained user feedback. High confidence data is then used for fine-tuning the recognition model via transfer learning. No changes to the model architecture are required, and thus personalization can easily be added to an existing application.
Analyses in the context of hand wash detection demonstrate that a significant performance increase can be achieved. Moreover, I compare my approach with a traditional personalization method to confirm its robustness. Finally, I evaluate the process in a real-world experiment in which participants wore a smart watch on a daily basis for a month.
\chapter{Zusammenfassung}
......
\chapter{Introduction}\label{chap:introduction}
Detecting and monitoring people's activities can be the basis for observing user behavior and well-being. Human Activity Recognition (HAR) is a growing research area in many fields like healthcare~\cite{Zhou2020Apr, Wang2019Dec}, elder care~\cite{Jalal2014Jul, Hong2008Dec}, fitness tracking~\cite{Nadeem2020Oct} or entertainment~\cite{Lara2012Nov}. Especially the technical improvements in wearable sensors like smart watches allow an integration into everyday life across a wide user base~\cite{Weiss2016Feb, Jobanputra2019Jan, Bulling2014Jan}.
One of the application scenarios in healthcare is the observation of various diseases such as Obsessive-Compulsive Disorder (OCD). For example, the detection of hand washing activities can be used to derive the frequency or excessiveness that occurs in some people with OCD. Moreover, it is possible to diagnose and even treat such diseases outside a clinical setting~\cite{Ferreri2019Dec, Briffault2018May}. If excessive hand washing is detected, Just-in-Time Interventions can be presented to the user, which offers an enormous potential for promoting health behavior change~\cite{10.1007/s12160-016-9830-8}.
State-of-the-art Human Activity Recognition methods are supervised deep neural networks built from concepts like Convolutional Layers or Long Short-Term Memory (LSTM). These require lots of training data to achieve good performance. Since the movement patterns of each human are unique, the performance of activity detection can differ between users. Thus, training data from a wide variety of humans is necessary to generalize to new users. It has been shown that personalized models can achieve better accuracy than user-independent models~\cite{Hossain2019Jul, Lin2020Mar}.
To personalize a model, retraining on new unseen sensor data is necessary. Obtaining the ground truth labels is crucial for most deep learning techniques. However, the annotation process is time- and cost-intensive. Typically, training data is labeled in controlled environments by hand. In a real-world scenario, the user would have to take over the main part of this work.
This requires lots of user interaction and a certain expertise, which would harm the usability.
There has been various research on how to preprocess data to make it usable for training. A good trade-off is semi-supervised learning or active learning, where a general base model is used to label the data and only uncertain cases rely on user interaction~\cite{siirtola2019importance, Siirtola2019Nov}. Here, a small part of labeled data is combined with a larger unlabeled part to improve the detection model. But still, some sort of explicit user interaction is required for personalization, which adds an overhead to the usage of a HAR application.
The goal of my work is to personalize a detection model without increasing the user interaction. Information for labeling is drawn from indicators that arise during the use of the application. These can be derived from user feedback to triggered actions that result from the predictions of the underlying recognition model. Moreover, the personalization should be an additional and separate component, so no change of the model architecture is required.
At first, all new unseen sensor data is labeled by the same general model which is used for activity recognition. These model predictions are corrected to a certain extent by pretrained filters. High confidence labels are considered for personalization. In addition, the previously obtained indicators are used to further refine the data to generate a valid training set. Therefore, the process of manual labeling can be skipped and replaced by an automatic combination of available indications. With the newly collected and labeled training data, the previous model can be fine-tuned in an incremental learning approach~\cite{Amrani2021Jan, Siirtola2019May, Sztyler2017Mar}. For neural networks it has been shown that transfer learning offers high performance with decent computation time~\cite{Chen2020Apr}. In combination, this leads to a personalized model with improved performance in detecting specific gestures of an individual user.
I applied the described personalization process to a hand washing detection application which is used for observing the behavior of OCD patients. During the observation, the user answers requested evaluations if the application detects hand washing. For mispredictions the user has the opportunity to reject evaluations. Depending on how the user reacts to the evaluations, conclusions are drawn about the correctness of the predictions, which leads to the required indicators.
The contributions of my work are as follows:
......
......@@ -4,23 +4,23 @@ Human Activity Recognition (HAR) is a wide research field and is used in a varie
In the following, I give a brief overview of the literature on state-of-the-art HAR and how personalization can improve its performance. Then I focus on work that deals with different approaches to the generation of training data. Lastly, I present work that deals with cleaning faulty labels in training data.
\section{Activity recognition}\label{sec:relWorkActivityRecognition}
Most used Inertial Measurement Units (IMUs) provide a combination of 3-axis acceleration and orientation data in continuous streams. Sliding windows are applied to the streams and are assigned to an activity by the underlying classification technique~\cite{s16010115}. This classifier is a prediction function $f(x)$ which returns the predicted activity labels for a given input $x$. Recently, deep neural network techniques have replaced traditional ones such as Support Vector Machines or Random Forests since no hand-crafted features are required~\cite{ramasamy2018recent}. They use multiple hidden layers of feature decoders and an output layer which provides predicted class distributions~\cite{MONTAVON20181}. Each layer consists of multiple artificial neurons which are connected to the neurons of the following layer. These connections are assigned weights which are learned during the training process. First, in the feed-forward pass, the output values are computed based on a batch of training data. In the second stage, called back propagation, the error between the expected and predicted values is computed by a loss function $J$ and minimized by optimizing the weights. This is repeated over multiple iterations~\cite{Liu2017Apr}.
The combination of Convolutional Neural Networks (CNNs) and Long Short-Term Memory recurrent neural networks (LSTMs) tends to outperform other approaches and is considered the current state of the art for human activity recognition~\cite{9043535}. For classification problems, cross entropy is used as the loss function in most works. \extend{???}
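As a rough illustration (not the specific architecture used in this work), a CNN-LSTM classifier for windowed IMU data could be sketched in PyTorch as follows; the channel count, window length and layer sizes are arbitrary assumptions:
\begin{verbatim}
import torch
import torch.nn as nn

class ConvLSTMClassifier(nn.Module):
    """Minimal CNN-LSTM sketch: conv layers extract local features,
    an LSTM models their temporal order, a linear layer classifies."""
    def __init__(self, n_channels=6, n_classes=2):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(n_channels, 32, kernel_size=5, padding=2), nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=5, padding=2), nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.lstm = nn.LSTM(input_size=64, hidden_size=64, batch_first=True)
        self.classifier = nn.Linear(64, n_classes)

    def forward(self, x):            # x: (batch, channels, time)
        z = self.features(x)         # (batch, 64, time/2)
        z = z.permute(0, 2, 1)       # LSTM expects (batch, time, features)
        _, (h_n, _) = self.lstm(z)   # use the last hidden state
        return self.classifier(h_n[-1])

model = ConvLSTMClassifier()
loss_fn = nn.CrossEntropyLoss()      # cross entropy as in most HAR works
logits = model(torch.randn(8, 6, 150))
loss = loss_fn(logits, torch.randint(0, 2, (8,)))
\end{verbatim}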
\section{Personalization}\label{sec:relWorkPersonalization}
However, it can happen that even well-performing architectures yield worse results in real-world scenarios. Varying users and environments create many different influences that can affect performance. These could be the position of the device, differences between the sensors or human characteristics~\cite{ferrari2020personalization}.
Each user differs in their own movement patterns, so a general detection model may suffer. To overcome this problem, a general model should be trained on a wide range of users and has to cover as many different motion patterns as possible. But this would require unrealistically large data sets. Besides the storage and processing costs, the availability of public datasets is very limited since labeling is a difficult task. The goal is to generalize the model as much as possible with respect to the final user.
It has been shown that a personalized model trained with additional user-specific data (even just a small amount) can significantly outperform the general model~\cite{8995531, doi:10.1137/1.9781611973440.71, zhao2011cross}. In my work I concentrate on data-based approaches, which can be split into \textit{subject-independent}, \textit{subject-dependent} and \textit{hybrid} dataset configurations~\cite{Ferrari2021Sep}. The subject-independent model represents the general model where no user-specific data is used for training, whereas the subject-dependent model relies solely on the user's own data. A subject-dependent model would generalize best with respect to the final user but requires enough specific data from each user. As the combination of both, the hybrid configuration uses the data of all other users together with additional data of the target user. This should result in a better detection of the final user's activities than the subject-independent model but is easier to train than the subject-dependent one since less data of the final user is required. It is even possible that a hybrid approach can achieve similar performance to the subject-dependent one but with less user-specific data~\cite{weiss2012impact, Chen2017Mar}.
In a common hybrid approach, a general user-independent model is used first, until new data of a previously unseen user is gathered. The new data is used to fine-tune the existing model. For neural networks, deep transfer learning has been shown to provide a suitable approach to adapt an existing model with additional data~\cite{Tan2018Sep}. The idea is to transfer knowledge from a previously trained model to a new model which solves a similar task. In the case of personalization, transfer learning is used for domain adaptation~\cite{AlHafizKhan2018Mar}. Given a source domain $D_S$ with learning task $T_S$ and a target domain $D_T$ with learning task $T_T$ where $D_S \neq D_T$ and $T_S=T_T$, the goal is to improve the target prediction function $f_T(\cdot)$ for $T_T$ using knowledge from $D_S$ and $T_S$~\cite{Ghafoorian2017Sep, Lebichot2019Apr}. In particular, mini-batch optimization is used, where multiple new training instances are collected over time and then used for fine-tuning the model by weight updates.
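A minimal sketch of such a fine-tuning step with mini-batch optimization might look like this; the optimizer choice, learning rate and epoch count are placeholders, not values taken from the cited works:
\begin{verbatim}
import torch

def fine_tune(model, loader, epochs=5, lr=1e-4):
    """Fine-tune a pretrained model on user-specific mini batches."""
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()
    for _ in range(epochs):
        for windows, labels in loader:      # mini batches of new user data
            optimizer.zero_grad()
            loss = loss_fn(model(windows), labels)
            loss.backward()                 # back propagation
            optimizer.step()                # weight update
    return model
\end{verbatim}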
To compare the performance of deep learning personalization to traditional techniques, Amrani et al. compared deep transfer learning approaches based on CNNs with a baseline of incremental learning using Learn++~\cite{Amrani2021Jan}. They demonstrated that deep learning outperforms Learn++ and adapts faster to new users.
Rokni et al. used CNNs and personalized them by transfer learning where the lower layers are reused and just the upper ones are retrained~\cite{Rokni2018Apr}. This is motivated by the assumption that the features learned in the first layers can be reused in other domains and just the classification has to be adapted~\cite{Yosinski2014}. They significantly improved the accuracy of activity recognition with just a few labeled instances. Hoelzemann and Van Laerhoven analyzed how results differ with respect to different methods and applications if transfer learning is applied to a Deep Convolutional LSTM network~\cite{Hoelzemann2020Sep}. They suggest that convolutional layers should not be fine-tuned, as already mentioned in the work of Rokni. Furthermore, they advise reinitializing the LSTM layers to default values, which results in slightly better performance in some cases.
A typical problem which can occur during fine-tuning is catastrophic forgetting~\cite{Lee2017}. Important information which has been learned before gets lost by overfitting to the new target. To overcome this problem, Xuhong et al. and Li et al. analyzed different regularization schemes which are applied to inductive transfer learning~\cite{xuhong2018explicit, Li2020Feb}. They state that the L2-SP penalty should be considered as the standard baseline for transfer learning and that it also outperforms freezing the first layers of a network. The idea is that the learned parameters should remain close to their initial values during fine-tuning. So the pre-trained model is a reference which defines the effective search space. To do that, a regularizer $\Omega(\omega)$ over the network parameters $\omega$ which have to be adapted is added to the result of the loss function. For the L2-SP penalty, the regularizer is defined as:
\begin{align}
\Omega(\omega) &= \frac{\alpha}{2}\left\Vert \omega-\omega^0 \right\Vert^2_2
\end{align}
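As a hedged sketch, this penalty could be added to the training loss as follows; $\alpha$ and the parameter snapshot are placeholders, and the cited work additionally treats newly added layers with a plain L2 term, which is omitted here:
\begin{verbatim}
import torch

def l2_sp_penalty(model, initial_params, alpha=0.01):
    """L2-SP regularizer sketch: penalize the squared distance of the
    current weights to their pre-trained starting point omega^0."""
    penalty = 0.0
    for name, w in model.named_parameters():
        w0 = initial_params[name]            # snapshot taken before fine-tuning
        penalty = penalty + torch.sum((w - w0) ** 2)
    return 0.5 * alpha * penalty

# usage inside the training loop (sketch):
# initial_params = {n: p.detach().clone() for n, p in model.named_parameters()}
# loss = loss_fn(model(x), y) + l2_sp_penalty(model, initial_params)
\end{verbatim}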
......@@ -47,24 +47,24 @@ Saeedi et al. combine an active learning approach with a neural network consisti
Ashari and Ghasemzadeh observed the limitations of the response capabilities of a user~\cite{Ashari2019Jul}. This applies not only to the number of queries but also to the time discrepancy between when a query is posed and when it is answered. They add the user's ability to remember the correct label of a sample as a criterion to the selection process of queried instances.
Active learning can be combined with a semi-supervised approach as shown by Hasan and Roy-Chowdhury~\cite{Hasan2015Sep}. They use active learning where samples with a high tentative prediction probability are labeled by a weak learner, i.e. a classification algorithm. Only samples with low certainty and a high potential model change are queried to the user. So they can enlarge the training set without increasing the user interaction. They achieved performance competitive with state-of-the-art active learning methods but with a reduced amount of manually labeled instances.
\subsection{Self-supervised learning}\label{sec:relWorkSelfSupervisedLearning}
Here, a deep neural network is introduced which learns to solve predefined transformation recognition tasks in an unsupervised manner. I.e., different transformation functions like noise, rotation or negation are applied to an input signal, which generates new distinct versions. The network predicts the probabilities that a given sequence is a transformation of the original signal. Since the transformation functions are known, a self-supervised labeled training set can be constructed. The idea is that, to detect the transformation tasks, the core characteristics of the input signal have to be learned. These high level semantics can then be used as the feature layers for the classifier. To train the classification layer, just a few labeled samples are required.
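A simplified sketch of how such a transformation recognition data set could be constructed is shown below; the concrete transformation functions are illustrative assumptions and differ from the exact set used in the cited works:
\begin{verbatim}
import numpy as np

# Illustrative transformation functions on a (time, channels) signal.
TRANSFORMS = [
    lambda x: x,                                      # 0: original
    lambda x: x + np.random.normal(0, 0.1, x.shape),  # 1: added noise
    lambda x: -x,                                     # 2: negation
    lambda x: x[::-1].copy(),                         # 3: time reversal
]

def make_self_supervised_set(windows):
    """Label each transformed window with the index of the applied
    transformation; no human annotation is needed."""
    samples, labels = [], []
    for w in windows:
        for label, t in enumerate(TRANSFORMS):
            samples.append(t(w))
            labels.append(label)
    return np.stack(samples), np.array(labels)
\end{verbatim}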
Saeed et al. presented a self-supervised learning approach for HAR~\cite{Saeed2019Jun}. They achieve a significantly better performance than traditional unsupervised learning methods and a performance comparable with fully supervised methods, especially in a semi-supervised scenario where a few labeled instances are available. Tang et al. extend this by a combination of self-supervised learning and self-training~\cite{Tang2021Feb}. A teacher model is trained first using supervised labeled data. The teacher model is then used to relabel the supervised dataset and additional unseen instances. The most confident samples are augmented by transformation functions as mentioned previously. After that, the self-supervised dataset is used to train a student model. In addition, it is fine-tuned with the originally supervised instances. By combining the unlabeled data with the limited labeled data, performance can be further enhanced.
\subsection{Partial labels}
In situations where it is not possible to determine exactly when an activity was performed, partial labels can be used to indicate which activities are included in a larger time period. Multiple contiguous instances can be collected in a single bag which is labeled with the covered classes. With these partial labels, also called weak labels, the actual classes of the contained instances can be predicted more precisely.
In the work of Stikic et al., multi-instance learning with weak labels is used for HAR~\cite{Stikic2011Feb}. A set of instances is collected in a bag which is labeled by a single bag label according to the instances. Therefore, a user could simply specify which activities have happened in a certain time period without explicit allocations. The labels of the single instances are then computed by a graph-based label propagation. Hussein et al. use an active learning approach with partial labels for personalized autocomplete teleoperations~\cite{Hussein2021Jul}. For partial feedback, the user does not have to give the exact label to a query but just answers 'yes or no' questions which cover multiple possible labels. Each feedback excludes a part of the classes. The partial label consists of the set of remaining possible classes. In this case the partial feedback is gathered while the user accepts or rejects predicted motions. The adapted framework can reduce false predictions significantly.
%\cite{Pham2017Jan} uses a dynamic programming approach.
\subsection{Pseudo labeling}\label{sec:relWorkPseudoLabeling}
Pseudo labeling allows an unsupervised domain adaptation by using the predictions of the base model~\cite{lee2013pseudo}. Based on the prediction of a sample, an artificial pseudo-label is generated which is treated as ground truth data. However, this requires that the initially trained model predicts pseudo-labels with high confidence, which is hard to satisfy. Training with false pseudo-labels has a negative impact on the personalization. Moreover, it is possible that pseudo-labeling overfits to incorrect pseudo-labels over multiple iterations, which is known as confirmation bias. Therefore, there are many approaches to augment the pseudo labels to reduce the amount of false training data. Since a base model is required, which is in most cases trained on supervised data, pseudo labeling is a part of semi-supervised learning. But compared to other semi-supervised approaches, pseudo labeling offers a simple implementation which does not rely on domain-specific augmentations or any changes to the model architecture.
Li et al. showed a naive approach for semi-supervised learning using pseudo labels~\cite{Li2019Sep}. First, a pseudo labeling model $M_p$ is trained using a small supervised labeled data set $L$. This model is then used to perform pseudo-labeling for new unlabeled data, which results in dataset $\hat{U}$. After that, a deep learning model $M_{NN}$ is pre-trained with the pseudo labeled data $\hat{U}$ and afterwards fine-tuned with the supervised data $L$. This process is repeated, where the resulting model $M_{NN}$ is used as the new pseudo labeling model $M_p$, until the validation accuracy converges. Moreover, they use the fact that the predictions of a classifier model are probabilistic and assume that labels with a higher probability also have a higher accuracy. Therefore, they use only pseudo labels with a high certainty. They argue that pseudo-labeling can be seen as a kind of data augmentation. Even with high label noise in the pseudo labels, a deep neural network should be able to improve with training. In their tests they achieved significant improvements in accuracy by adding pseudo labels to the training. Furthermore, they showed that the model benefits especially from the first iterations. Nevertheless, it is required that the pseudo labeling model $M_p$ has a certain accuracy. Tests show that a better pseudo labeling model leads to a higher accuracy of the fine-tuned model. Arazo et al. observed the performance of naive pseudo labeling applied to images and showed that it overfits to incorrect pseudo labels~\cite{Arazo2020Jul}. The trained model tends to have a higher confidence in previously falsely predicted labels, which results in new incorrect predictions. They applied simple modifications to prevent confirmation bias without requiring multiple networks or any consistency regularization methods as done in other approaches like in \secref{sec:relWorkSelfSupervisedLearning}. With the use of mixup augmentation as regularization and adding a minimum number of labeled samples, they achieved state-of-the-art performance. Additionally, they use soft labels instead of hard labels for training. Here a label consists of the individual class affiliations instead of a single value for the target class. Thereby it is possible to depict uncertainty over the classes. As a mixup strategy, they combine random sample pairs and their corresponding labels, which creates a data augmentation with label smoothing. This should reduce the confidence of the network predictions. As they point out, this approach is simpler than using other regularization methods and, moreover, more accurate.
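A minimal sketch of such confidence-filtered pseudo labeling could look like the following; the threshold value is an arbitrary assumption:
\begin{verbatim}
import torch

def pseudo_label(model, windows, threshold=0.9):
    """Keep only predictions whose softmax confidence exceeds the
    threshold and use them as pseudo ground truth."""
    model.eval()
    with torch.no_grad():
        probs = torch.softmax(model(windows), dim=1)
    confidence, labels = probs.max(dim=1)
    keep = confidence >= threshold
    return windows[keep], labels[keep]
\end{verbatim}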
......@@ -74,14 +74,14 @@ Rizve et al. \cite{Rizve2021Jan} tackles the problem of relatively poor performa
%\cite{Gonzalez2018May} compared different self-labeling methods in a time series context. In self-training, a base learner is firstly trained on a labeled set. Then, unlabeled instances are classified by the base classifier, where it is assumed, that more accurate predictions tend to be correct. After that the labeled training set is enlarged with these self-labeled instances. They achieved for the best performing methods similar performance to the supervised learning.
\section{Learning with label noise}\label{sec:relWorkLabelNoise}
When learning with generated labels, it may happen that some of these labels carry a wrong annotation, so-called label noise. This has a negative impact on the training process. Depending on the type of noise, the influence on the model can differ. Where \textit{uniform noise} and \textit{class-dependent noise} can still achieve good model accuracy up to a certain degree, \textit{feature-dependent noise} results in much worse performance~\cite{Algan2020Mar}. Since in synthetic annotation processes the labels are determined by the data, wrong annotations lead to feature-dependent noise. So it is important to handle noisy labels during training.
Label noise does not only occur in generated labels. Also in supervised learning it is possible that there exist mislabeled instances~\cite{Frenay2013Dec}. This can happen due to small boundaries between classes which are hard or even impossible to distinguish, or due to errors in human annotations. Therefore, there are numerous research works on training neural networks with noisy labels. One approach is to adapt the learning process itself to become robust against noise~\cite{Patrini2017, Ghosh2017Feb}; another is to clean the noise in the training data set. In my work, I focus on the latter, as no changes have to be made to the model architecture or the training implementation.
Furthermore, it is also possible for standard neural networks to learn from arbitrarily noisy data and still perform well. They also benefit from larger data sets which can accommodate a wide range of noise~\cite{Rolnick2017May}. In particular, there is a lot of research in the field of image denoising, where deep neural networks have become very promising~\cite{Xie2012, Dong2018Oct, Gondara2016Dec}. A lot of these approaches can also be applied to time series data. Since in our case the labels result from time series samples, the labels themselves can also be considered as time series data. Therefore it is likely that two adjacent labels have the same value and an outlier is probably a false label. This is similar to noise in a signal, and denoising or smoothing would be the same as correcting the wrong labels.
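As a toy illustration of this intuition (a simple median filter rather than the learned denoising filters discussed later in this work), an isolated outlier in a label sequence can be removed by smoothing:
\begin{verbatim}
import numpy as np
from scipy.ndimage import median_filter

# An isolated outlier label inside a run of identical labels is most
# likely wrong and is removed by median smoothing over 3 neighbors.
labels = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0])
smoothed = median_filter(labels, size=3)
print(smoothed)   # [0 0 0 0 0 0 1 1 1 1 0 0]
\end{verbatim}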
\subsection{Autoencoder}\label{sec:relWorkAutoencoder}
For denoising signals, autoencoders have been shown to achieve good results~\cite{Xiong2016Nov, Chiang2019Apr, Xiong2016Jun}. An autoencoder is a neural network model which consists of two parts, the encoder and the decoder. In encoding, the model tries to map the input into a more compact form with low-dimensional features. During decoding, this compressed data is reconstructed to the original space~\cite{Liu2019Feb}. During training, the model is adjusted to minimize the reconstruction error between the predicted output and the expected output. Autoencoders have become popular for unsupervised pretraining of deep neural networks. Vincent et al. introduced the use of autoencoders for denoising~\cite{vincent2010stacked}. A denoising autoencoder receives corrupted input samples and is trained with their clean values to learn the prediction of the original data. So not just a mapping from input to output is learned, but also features for denoising.
A traditional autoencoder consists of three fully connected (FC) layers, where the first layer is used for encoding and the last layer for decoding. The middle layer represents the hidden state of the compact data. Chiang et al. use a fully convolutional network as their architecture for denoising electrocardiogram signals~\cite{Chiang2019Apr}. The network consists of multiple convolutional layers for the encoding part and an inversely symmetric decoder using deconvolutional layers. An additional convolutional layer is used for the output. They use a stride of 2 in their convolutional layers to downsample the input signal and upsample it again towards the output. So no pooling layers are required and the exact signal alignment of input and output is preserved. This results in a compression from a 1024x1 dimensional input signal to a 32x1 dimensional feature map. Since no fully connected layers are used, the number of weight parameters is reduced and the locally-spatial information is preserved. As loss function they use the root mean square error, which determines the variance between a predicted output and the original signal. Similarly, Garc\'{i}a-P\'{e}rez et al. use a fully-convolutional denoising auto-encoder (FCN-dAE) architecture for Non-Intrusive Load Monitoring~\cite{Garcia-Perez2020Dec}. Both have shown that a FCN-dAE outperforms a traditional autoencoder in terms of improving the signal-to-noise ratio.
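A strongly simplified sketch of such a fully convolutional denoising autoencoder for 1D signals is given below; the layer count, channel sizes and sequence length are assumptions and do not reproduce the exact architectures of the cited works:
\begin{verbatim}
import torch
import torch.nn as nn

class FCNDenoisingAE(nn.Module):
    """Sketch of a fully convolutional denoising autoencoder: strided
    1D convolutions compress the sequence, transposed convolutions
    restore the original length (no pooling, no FC layers)."""
    def __init__(self, channels=1):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(channels, 16, kernel_size=5, stride=2, padding=2), nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=5, stride=2, padding=2), nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(32, 16, kernel_size=5, stride=2,
                               padding=2, output_padding=1), nn.ReLU(),
            nn.ConvTranspose1d(16, channels, kernel_size=5, stride=2,
                               padding=2, output_padding=1),
        )

    def forward(self, x):                    # x: (batch, channels, length)
        return self.decoder(self.encoder(x))

model = FCNDenoisingAE()
noisy = torch.rand(4, 1, 256)
reconstruction = model(noisy)                # same shape as the input
# RMSE-style loss; during training the target would be the clean signal
loss = torch.sqrt(nn.functional.mse_loss(reconstruction, noisy))
\end{verbatim}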
......
......@@ -8,18 +8,21 @@ Finally, I present an active learning implementation, which is used for performa
\section{Base Application}\label{sec:approachBaseApplication}
The system I build on consists of an Android Wear OS application, which is executed on a smart watch, and an HTTP web service. It is used to observe the obsessive behavior of a participant to treat OCD. Therefore, all wrist movements are recorded and surveys on the user's mental condition for each hand washing are collected. If the device detects a hand wash activity, a notification is prompted to the user, which can then be confirmed or declined. A confirmation leads to the evaluation process for the user. Furthermore, a user can trigger manual evaluations if a hand washing was not detected by the device. These evaluations can be used later by psychologists to analyze and treat the participant's state during the day. \figref{fig:baseApplicationScreen} shows screenshots of the application.
\input{figures/approach/base_application_screen}
For activity prediction the application uses a general neural network model based on the work of Robin Burchard~\cite{robin2021}. The integrated IMU of the smart watch is used to record wrist movements and the sensor data is stored in a buffer. After a cycle of 10 seconds the stored data is used to predict the current activity. Therefore, a sliding window with a length of 3 seconds and a window shift of 1.5 seconds is applied to the buffer. For each window the largest distance between the sensor values is calculated to filter out sections with just little movement. If there is some motion, the general recognition model is applied to the windows of this section to predict the current activity label. To avoid detections based on outliers, a running mean is computed over the last $kw$ predictions. Only if it exceeds a certain threshold $kt$ is the final detection triggered. Additionally, the buffer is saved to an overall recording in the internal storage.
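The detection logic can be illustrated with the following sketch; the values chosen for $kw$ and $kt$ are placeholders, not the settings of the actual application:
\begin{verbatim}
from collections import deque

import numpy as np

def detection_loop(prediction_stream, kw=10, kt=0.7):
    """Trigger a hand wash detection once the running mean of the last
    kw hand wash probabilities exceeds the threshold kt."""
    recent = deque(maxlen=kw)
    for p_hw in prediction_stream:          # per-window hand wash probability
        recent.append(p_hw)
        if len(recent) == kw and np.mean(recent) > kt:
            yield True                      # prompt the user for an evaluation
            recent.clear()                  # avoid repeated triggers
        else:
            yield False
\end{verbatim}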
While charging the smart watch, all sensor recordings and user evaluations are sent to the web server. There they are collected and assigned to the respective watches using the Android ID. The server also provides a web interface for accessing and managing all recording sets and participants. In addition, various statistics are generated for the gathered data.
\section{Datasets}
To personalize a human activity recognition model, it must be re-trained with additional sensor data from the particular user. In our case this data has to come from the IMUs of wrist-worn devices during various human activities. Typically, it consists of a set $S=\{S_0,\dots,S_{k-1}\}$ of $k$ time series. Each $S_i\in \mathbb{R}^{d_i}$ is a sensor-measured attribute with dimensionality $d_i$ of sensor $i$. Additionally, there is a set of $n$ activity labels $A=\{a_0, \dots, a_{n-1}\}$ and each $S_i$ is assigned to one of them~\cite{Lara2012Nov}. For activity prediction I use a sliding window approach where I split the data set into $m$ time windows $W=\{W_0, \dots, W_{m-1}\}$ of equal size $l$. The windows are shifted by $v$ time steps, which means that they overlap if $v < l$. Each window $W_i$ contains a sub-set of the time series, $W_i=\{S_{i\cdot v}, \dots, S_{i\cdot v + l}\}$, and is assigned to an activity label, which builds the set of labels $Y=\{y_0, \dots, y_{m-1}\}$.
Most of today's wearable devices contain an accelerometer and a gyroscope with three dimensions each. I combine the sets $S_{acceleration}$ and $S_{gyroscope}$ into one set with $S_i\in \mathbb{R}^{d_{acceleration}+d_{gyroscope}}$. In the case of hand wash detection I use the activity labels $A=\{null, hw\}$, where \textit{null} represents all activities where no hand washing is covered and \textit{hw} represents all hand washing activities.
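A possible implementation of this windowing, assuming for illustration a window length of 150 samples with a shift of 75 samples and majority-vote window labels, could look like this (the helper name and parameter values are illustrative):
\begin{verbatim}
import numpy as np

def make_windows(acc, gyro, labels, l=150, v=75):
    """Combine 3-axis acceleration and gyroscope streams and cut them
    into windows of length l shifted by v samples (overlap if v < l)."""
    data = np.concatenate([acc, gyro], axis=1)   # shape: (timesteps, 6)
    windows, window_labels = [], []
    for start in range(0, len(data) - l + 1, v):
        windows.append(data[start:start + l])
        # label the window by the majority ground truth label it covers
        window_labels.append(np.bincount(labels[start:start + l]).argmax())
    return np.stack(windows), np.array(window_labels)
\end{verbatim}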
\subsection{Synthetic data sets}\label{sec:synDataset}
There are several published data sets containing sensor data of wearable devices during various human activities. Since most public data sets are separated into individual parts for each activity, artificial data sets have to be created which consist of a continuous sequence of activities. There should be a realistic arrangement of \textit{null} and \textit{hw} samples, such that they build larger parts of non hand washing activities with short hand washing parts in between, like in a real world scenario.
Furthermore, additional data for user feedback which covers parts of the time series is required. We can use the general prediction model to determine hand wash parts as it would be done in the base application. In our case we apply a running mean over multiple windows to the predictions and trigger an indicator $e_i$ at window $W_j$ if it is higher than a certain threshold. This indicator $e_i$ gets the value \textit{correct} if one of the ground truth labels covered by the mean is \textit{hw}, otherwise it is \textit{false}. This represents the user feedback to confirmed or declined evaluations. For hand wash sequences where no indicator has been triggered, a manual user feedback indicator is added.
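A sketch of how these indicators could be simulated for a synthetic data set is given below; $kw$ and $kt$ are again placeholder values:
\begin{verbatim}
import numpy as np

def simulate_indicators(hw_probs, ground_truth, kw=10, kt=0.7):
    """Simulate user feedback indicators for a synthetic data set:
    whenever the running mean of the hand wash probabilities exceeds
    the threshold, emit an indicator and mark it 'correct' if the
    covered ground truth actually contains a hand wash window."""
    indicators = []
    for j in range(kw, len(hw_probs)):
        if np.mean(hw_probs[j - kw:j]) > kt:
            covers_hw = bool(np.any(np.asarray(ground_truth[j - kw:j]) == 1))
            indicators.append((j, "correct" if covers_hw else "false"))
    return indicators
\end{verbatim}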
......@@ -31,19 +34,18 @@ Since adjacent windows tend to have the same activity, one indicator can cover s
\subsubsection{Used data sets}
For this work I used data sets from the University of Basel and the University of Freiburg [REF]. These include hand washing data which was recorded using a smart watch application. Additionally, they contain long term recordings with everyday activities. The data is structured by individual participants and multiple activities per recording. During the generation of a synthetic data set, the data of a single participant is selected randomly. To cover enough data, I had to combine the data sets and merge single participants across the data sets. Therefore, a resulting data set for a user contains multiple participants which I treat as one. This just affects the data for \textit{null} activities. All hand wash activity data is from the same user. Since the same data sets have already been used to train the base model, I had to retrain individual base models for each participant such that none of their own data is contained.
\extend{specification of resulting dataset}
\subsection{Recorded data sets}
The base application records and transfers all sensor data and evaluations of daily usage to a web server. Therefore, I could use the application to collect additional training sets for my thesis. However, certain analyses require ground truth data of the activity labels, which is not included in the recordings. I have added markers to depict the exact beginning and end of a hand wash action, so the ground truth data can be derived. These markers are set by hand and are not part of the final personalization process. The generated data sets have the same format as the synthetic data sets described in \secref{sec:synDataset}. Just the indicators are not generated but taken from the real user feedback.
\section{Personalization}\label{sec:approachPersonalization}
In this work, I employ a personalization approach which does not require additional user interaction and can be added to the existing application without changes in the models architecture. So the base application can still work by itself and the user does not notice any changes in the usage.\\
In this work, I employ a personalization approach which does not require additional user interaction and can be added to the existing application without changes to the model's architecture. Thus the base application can still work by itself and the user does not notice any change in usage.\\
Requirements for this process are:
\begin{itemize}
\item[1)] Collection of all sensor data\\ Since an application has to listen to the sensor data anyway for its predictions, it should be possible to additionally save it to internal storage. Furthermore, it has to be provided to the personalization service in some way. In my case, the sensor recordings are transmitted to a central point anyway, so it is easy to access them.
\item[2)]User responses to predictions\\ These can be simple yes/no feedback. I assume that there is at least some kind of confirmation when an activity is detected, such that the application can perform its task. For example, the application in this work is used to query an evaluation from the user if a hand wash activity is performed. Therefore, I can deduce that the recognition was correct, should such an evaluation have taken place. If the user declines, I know the prediction was incorrect. In cases where there is neither a confirmation or rejection it is possible to ignore this part or treat it like a false prediction. In section ??? I show, that especially the confirmation to predictions has an impact to the personalization performance. Therefore, the no-feedback could be neglected if it is not possible to implement or it would change the usage.
\item[2)] User responses to predictions\\ These can be simple yes/no feedback. I assume that there is at least some kind of confirmation when an activity is detected, such that the application can perform its task. For example, the application in this work queries an evaluation from the user when a hand wash activity is detected. Therefore, I can deduce that the recognition was correct if such an evaluation has taken place. If the user declines, I know the prediction was incorrect. In cases where there is neither a confirmation nor a rejection, it is possible to ignore this part or to treat it like a false prediction. In \secref{sec:expMissingFeedback} I show that especially the confirmation of predictions has an impact on the personalization performance. Therefore, the no-feedback case could be neglected if it is not possible to implement or if it would change the usage. A minimal sketch of how such responses can be represented follows after this list.
\end{itemize}
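The sketch below shows one possible representation of the collected feedback; the type and field names are illustrative assumptions, not the actual data model of the application.
\begin{verbatim}
from dataclasses import dataclass
from enum import Enum

class Feedback(Enum):
    CORRECT = "correct"   # user confirmed a detected hand wash
    FALSE = "false"       # user declined a detected hand wash
    NEUTRAL = "neutral"   # query was ignored, no statement possible
    MANUAL = "manual"     # user triggered a hand wash entry manually

@dataclass
class Indicator:
    window: int           # index of the window at which the indicator fired
    value: Feedback
\end{verbatim}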
......@@ -53,14 +55,14 @@ The neural network model I want to personalize has been implemented and trained
To personalize the model I use transfer learning in a domain adaptation manner as described in \secref{sec:relWorkPersonalization}. This requires additional labeled training data. Due to condition $1)$, all sensor data from a user is available, however without any labels. Therefore, I generate pseudo labels based on the predictions of the general activity recognition model. Additionally, these labels are refined as described in the following \secref{sec:approachLabeling}. The use of pseudo labels leads to a supervised training setup. This allows the model architecture to remain unchanged, and main parts of the original training implementation and hyperparameter settings, which have been elaborated by Robin Burchard~\cite{robin2021}, can be reused.
\subsubsection{Regularization}\label{sec:approachRegularization}
To avoid over-fitting to the target during multiple iterations of personalization I try two different approaches. The first approach is to freeze the feature layers of the model. As shown by Yosinski et al. feature layers tend to be more generalizeable and can better transferred to the new domain~\cite{Yosinski2014}. Therefore the personalization is just applied to the classification layer. So less parameters have to be fine tuned, what results in less computation time and a smaller amount of training data can have significant impact to the model. In the second approach I apply L2-SP penalty to the optimization as mentioned by Xuhong et al.~\cite{xuhong2018explicit}. Here the regularization restricts the search space to the initial model parameters. Therefore information which is learned in the pre-training stays existent even over multiple fine tuning iterations. This allows to adjust all parameters which offers more flexibility in fine-tuning. To test which approach fits best, I compare them in \secref{??}.
To avoid over-fitting to the target during multiple iterations of personalization, I try two different approaches. The first approach is to freeze the feature layers of the model. As shown by Yosinski et al., feature layers tend to be more generalizable and can be transferred better to a new domain~\cite{Yosinski2014}. Therefore, the personalization is applied only to the classification layer. Fewer parameters have to be fine-tuned, which reduces computation time, and even a small amount of training data can have a significant impact on the model. In the second approach I apply the L2-SP penalty to the optimization as proposed by Xuhong et al.~\cite{xuhong2018explicit}. Here the regularization restricts the search space to the vicinity of the initial model parameters. Therefore, information learned during pre-training stays present even over multiple fine-tuning iterations. This allows all parameters to be adjusted, which offers more flexibility in fine-tuning. To test which approach fits best, I compare them in \secref{sec:expEvolCompFilter}.
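To make the two options concrete, the following sketch shows one possible realization in PyTorch; the sub-module name \texttt{features}, the learning rate, the penalty weight, and the epoch count are assumptions for illustration and not the actual training code of this work.
\begin{verbatim}
import torch

def personalize(model, loader, mode="freeze", alpha=1e-2, epochs=50):
    """Fine-tune either with frozen feature layers or with an L2-SP penalty."""
    # snapshot of the pre-trained parameters (the L2-SP anchor point)
    start_point = {n: p.detach().clone() for n, p in model.named_parameters()}

    if mode == "freeze":
        for p in model.features.parameters():   # assumed sub-module name
            p.requires_grad = False

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()

    for _ in range(epochs):
        for x, y in loader:
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            if mode == "l2sp":
                # penalize the squared distance to the pre-trained parameters
                loss = loss + alpha * sum(
                    ((p - start_point[n]) ** 2).sum()
                    for n, p in model.named_parameters())
            loss.backward()
            optimizer.step()
    return model
\end{verbatim}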
\section{Training data labeling}\label{sec:approachLabeling}
To retrain the base model with new sensor data, I use a semi-supervised approach using pseudo labels as described in \secref{sec:relWorkPseudoLabeling}. Since there is already a trained model $M_p$, I can make use of its supervision from pre-training to generate artificial labels $\hat{Y}=\{\hat{y}_0, \dots, \hat{y}_{m-1}\}$ based on predictions from new unlabeled sensor data $W=\{W_0, \dots, W_{m-1}\}$, where $\hat{y}_i=M_p(W_i)$. Therefore it is easy to get pseudo labels for all new sensor data. This process does not require any supervision, but allows the model to be trained in the same way as with a supervised method. So no changes ate the models architecture or based training implementation are required.
However, it is very likely that there are some wrong predictions that negatively affect the training, which is also called label noise. It is important to keep the amount of noise as low as possible. To observe the impact of wrong classified data I did several tests, which you can see in \secref{????}. As seen in \secref{sec:relWorkLabelNoise} there are multiple approaches to make the training more robust against label noise. I use the predicted values as soft-labels to depict uncertainty of the model. So the set of pseudo labels consists of vectors instead of crisp labels, $\hat{Y}=\{\hat{\bm{y}}_0, \dots, \hat{\bm{y}}_{m-1}\}$ where each $\hat{\bm{y}}_i = \begin{bmatrix}\hat{y}_i^{null}& \hat{y}_i^{hw}\end{bmatrix}$. The value $\hat{y}_i^{null}$ is the predicted membership for class \textit{null} and $\hat{y}_i^{hw}$ for class \textit{hw}. \figref{fig:examplePseudoSataset} shows an example plot of the predicted pseudo values of the previously seen dataset. In the following I call a pseudo label $\hat{\bm{y}}_i$ \textit{null} if $\hat{y}_i^{null} > \hat{y}_i^{hw}$ and \textit{hw} or hand wash if $\hat{y}_i^{null} < \hat{y}_i^{hw}$.
To retrain the base model with new sensor data, I use a semi-supervised approach with pseudo labels as described in \secref{sec:relWorkPseudoLabeling}. Since there is already a trained model $M_p$, I can make use of its supervision from pre-training to generate artificial labels $\hat{Y}=\{\hat{y}_0, \dots, \hat{y}_{m-1}\}$ based on predictions from new unlabeled sensor data $W=\{W_0, \dots, W_{m-1}\}$, where $\hat{y}_i=M_p(W_i)$. This makes it easy to obtain pseudo labels for all new sensor data. The process does not require any supervision, but allows the model to be trained in the same way as with a supervised method. Thus, no changes to the model architecture or the underlying training implementation are required.
However, it is very likely that there are some wrong predictions that negatively affect the training, which is also called label noise. It is important to keep the amount of noise as low as possible. To observe the impact of wrongly classified data I performed several tests, which can be seen in \secref{sec:expTransferLearningNoise}. As seen in \secref{sec:relWorkLabelNoise}, there are multiple approaches to make the training more robust against label noise. I use the predicted values as soft labels to depict the uncertainty of the model. So the set of pseudo labels consists of vectors instead of crisp labels, $\hat{Y}=\{\hat{\bm{y}}_0, \dots, \hat{\bm{y}}_{m-1}\}$ where each $\hat{\bm{y}}_i = \begin{bmatrix}\hat{y}_i^{null}& \hat{y}_i^{hw}\end{bmatrix}$. The value $\hat{y}_i^{null}$ is the predicted membership for class \textit{null} and $\hat{y}_i^{hw}$ for class \textit{hw}. \figref{fig:examplePseudoSataset} shows an example plot of the predicted pseudo values of the previously seen dataset. In the following I call a pseudo label $\hat{\bm{y}}_i$ \textit{null} if $\hat{y}_i^{null} > \hat{y}_i^{hw}$ and \textit{hw} or hand wash if $\hat{y}_i^{null} < \hat{y}_i^{hw}$.
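As a minimal illustration, the pseudo soft labels can be obtained as sketched below, assuming $M_p$ is a PyTorch module that returns class logits; this is a sketch, not the application's inference code.
\begin{verbatim}
import torch

@torch.no_grad()
def pseudo_soft_labels(model, windows):
    """Predict soft pseudo labels [y_null, y_hw] for a batch of sensor windows."""
    model.eval()
    probs = torch.softmax(model(windows), dim=1)   # shape (m, 2), rows sum to 1
    is_hw = probs[:, 1] > probs[:, 0]              # crisp view: hw if y_hw > y_null
    return probs, is_hw
\end{verbatim}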
\input{figures/approach/example_pseudo_labels}
......@@ -68,7 +70,7 @@ For further improvements, I rely on knowledge about the context of hand washing.
\subsection{User Feedback}
For each section where the running mean of model predictions reaches a certain threshold, an indicator is created which is either \textit{neutral}, \textit{false} or \textit{correct}. More over there can be indicators of type \textit{manual}. These provide following information about the the respective predictions.
For each section where the running mean of model predictions reaches a certain threshold, an indicator is created which is either \textit{neutral}, \textit{false} or \textit{correct}. Moreover, there can be indicators of type \textit{manual}. These provide the following information about the respective predictions.
\begin{itemize}
\item neutral:\\ The participant has not answered this query. However, if there is another indicator immediately afterwards, both probably cover the same activity, so we can assume the same value as that of the following indicator. If this is also \textit{neutral}, we continue this assumption over all following indicators until either an indicator with the value \textit{false}/\textit{correct} exists or the distance between two adjacent indicators is so large that we can no longer assume the same activity. In the latter case, no precise statement can be made about the activity or prediction.
......@@ -88,19 +90,19 @@ In the case of hand washing I set for a \textit{false} indicator $i$ at window $
\end{align*}
The set $\mathds{I}^{positive}$ is the union of the correct and manual sets and denotes all \textit{positive} intervals which should contain a hand wash sequence.
But we can not be sure, that a user has always answered the queries or triggered a manual indicator for all unrecognized activities. In section ?? I observe the performance impact of incomplete user feedback. Moreover I try to make the learning process robust against missing feedback. Since the indicators are created during the usage of the application I assume, that no incorrect feedback is made, because it would also lead to a worse user experience.
However, we cannot be sure that a user has always answered the queries or triggered a manual indicator for every unrecognized activity. In \secref{sec:expMissingFeedback} I observe the performance impact of incomplete user feedback. Moreover, I try to make the learning process robust against missing feedback. Since the indicators are created during regular usage of the application, I assume that no incorrect feedback is given, because it would also lead to a worse user experience. For example, if the user rejects correctly detected hand washing activities, no evaluation can be performed, which is contrary to the purpose of the application.
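As an illustration, the intervals can be grouped per indicator type as sketched below; the helper name and the placeholder \texttt{reach} parameter stand in for the exact interval bounds defined above and are assumptions for this sketch.
\begin{verbatim}
def feedback_intervals(indicators, reach=20):
    """Group the window ranges covered by each indicator type.

    `reach` is a placeholder: here an indicator simply covers the `reach`
    windows preceding its trigger position.
    """
    intervals = {"false": [], "correct": [], "manual": [], "neutral": []}
    for ind in indicators:
        start, end = ind["window"] - reach + 1, ind["window"]
        intervals[ind["value"]].append((max(start, 0), end))
    # positive intervals are those that should contain a hand wash sequence
    intervals["positive"] = intervals["correct"] + intervals["manual"]
    return intervals
\end{verbatim}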
\input{figures/approach/example_dataset_feedback}
\subsection{Denoising}
In this section I describe different approaches how the indicators can be used to refine the pseudo labels.
The pseudo labels are derived from the predictions of the general model. Therefore, we cannot be sure that they are correct. The indicators offer a source of ground truth information about the underlying activity. In this section I describe different approaches for how the indicators can be used to refine the pseudo labels.
In the first step we can use the raw information of the indicators. As said during a \textit{false} interval, we know, that the hand wash predictions are false. So we can set all predictions within the interval to \textit{null}, i.e. the soft label vectors to $\hat{\bm{y}}_i = \begin{bmatrix}1 & 0\end{bmatrix}$. For neutral intervals it is difficult to make a statement. For example it could be possible, that a user tends more to confirm all right predicted activities than decline false predictions, because a confirmation leads to the intended usage of the application. So we can assume that all correctly detected activities are confirmed, if not they are false predictions. In this case all labels in \textit{neutral} intervals can be set to \textit{null}. Otherwise, we cannot make a safe statement about the correctness of the prediction, but simply exclude these sections from training. Another approach would be to just use data which is covered by a \textit{false} or \textit{correct} indicator. With correcting the false labels and the knowledge that confirmed predicted labels are probably correct, there is a high certainty in the training data. But labels within \textit{positive} intervals can still contain outlier predictions which are false. As shown in section ?? as \textit{null} labeled \textit{hw} samples does not have as much negative impact to the performance as \textit{hw} labeled \textit{null} samples. Therefore it is crucial to be sure, that no pseudo label of a \textit{null} sample inside a \textit{positive} interval has a high value for hand washing. We know, that inside the interval there should be just a set of adjacent samples with label \textit{hw}. All others should be of type \textit{null}. In the following I concentrate in approaches to identify exacly these subset and correct all labels accordingly inside a \textit{positive} interval.
In the first step we can use the raw information of the indicators. As stated, during a \textit{false} interval we know that the hand wash predictions are wrong. So we can set all predictions within the interval to \textit{null}, i.e. the soft label vectors to $\hat{\bm{y}}_i = \begin{bmatrix}1 & 0\end{bmatrix}$. For \textit{neutral} intervals it is difficult to make a statement. For example, it could be that a user tends more to confirm correctly predicted activities than to decline false predictions, because a confirmation leads to the intended usage of the application. In that case we can assume that all correctly detected activities are confirmed and that unanswered queries correspond to false predictions, so all labels in \textit{neutral} intervals can be set to \textit{null}. Otherwise, we cannot make a safe statement about the correctness of the prediction and simply exclude these sections from training. Another approach would be to use only data which is covered by a \textit{false} or \textit{correct} indicator. By correcting the false labels and with the knowledge that confirmed predictions are probably correct, there is high certainty in the training data. However, labels within \textit{positive} intervals can still contain false predictions. As shown in \secref{sec:expTransferLearningNoise}, \textit{hw} samples labeled as \textit{null} do not have as much negative impact on the performance as \textit{null} samples labeled as \textit{hw}. Therefore it is crucial to ensure that no pseudo label of a \textit{null} sample inside a \textit{positive} interval has a high value for hand washing. We know that inside such an interval there should be just one set of adjacent samples with label \textit{hw}; all others should be of type \textit{null}. In the following I concentrate on approaches to identify exactly this subset and correct all labels inside a \textit{positive} interval accordingly.
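A minimal sketch of this first correction step is given below, assuming the pseudo labels are stored as an $m\times 2$ NumPy array and the intervals are grouped per indicator type as in the previous sketch; names and structure are illustrative.
\begin{verbatim}
import numpy as np

def apply_feedback(pseudo, intervals, treat_neutral_as_null=False):
    """Overwrite pseudo soft labels inside 'false' (and optionally 'neutral') intervals.

    pseudo:    array of shape (m, 2) with rows [y_null, y_hw]
    intervals: dict mapping indicator type to a list of (start, end) window ranges
    """
    corrected = pseudo.copy()
    kinds = ["false"] + (["neutral"] if treat_neutral_as_null else [])
    for kind in kinds:
        for start, end in intervals.get(kind, []):
            corrected[start:end + 1] = [1.0, 0.0]   # declined/unanswered -> null
    return corrected
\end{verbatim}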
\subsubsection{Naive}
In a naive approach we search for the largest bunch of neighbored pseudo-labels which have a high value for hand washing. This is done by computing a score over all subsets of adjacent labels. The score of a subset $Sub_k=\{\hat{\bm{y}}_p, \dots, \hat{\bm{y}}_q\}$ is computed by:
In a naive approach, we search for the largest group of neighboring pseudo labels which have a high value for hand washing. This is done by computing a score over all subsets of adjacent labels. The score of a subset $Sub_k=\{\hat{\bm{y}}_p, \dots, \hat{\bm{y}}_q\}$ is computed by:
\begin{align}
Score_k&=\sum_{\hat{\bm{y}}_i\in Sub_k}(-\delta+(\hat{y}^{hw}_i - 0.5))
\end{align}
......@@ -112,7 +114,7 @@ The score just benefits from adding a label to the set if the predicted value fo
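Because the score of a contiguous subset is a plain sum of per-label gains, the best subset can be found with the classical maximum-subarray recurrence instead of enumerating all subsets. The sketch below illustrates this; the margin $\delta$ is set to an assumed value. Labels inside the returned range would then be set to \textit{hw} and all remaining labels in the interval to \textit{null}.
\begin{verbatim}
def best_hw_subset(hw_values, delta=0.1):
    """Return (start, end) of the contiguous label subset with the highest score.

    hw_values: predicted hw memberships inside a positive interval
    delta:     margin a label must exceed 0.5 by to increase the score (assumed)
    """
    best, best_range = 0.0, None
    current, start = 0.0, 0
    for i, y_hw in enumerate(hw_values):
        gain = (y_hw - 0.5) - delta          # per-label contribution to Score_k
        if current <= 0.0:                   # restart the subset if the score dropped
            current, start = gain, i
        else:
            current += gain
        if current > best:
            best, best_range = current, (start, i)
    return best_range
\end{verbatim}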
\subsubsection{Deep Convolutional network}
Convolutional networks have become a popular method for image and signal classification. I use a convolutional neural network (CNN) to predict the value of a pseudo label given the surrounding pseudo labels. It consists of two 1D-convolutional layers and a linear layer for the classification output. Both convolutional layers have a stride of 1 and padding is applied. The kernel size of the first layer is 10 and that of the second is 5. They convolve along the time axis over the \textit{null} and \textit{hw} values. As activation function I use the Rectified Linear Unit (ReLU) after each convolutional layer. For the input I apply a sliding window of length 20 and shift 1 over the pseudo labels inside a \textit{hw} interval. This results in a 20x2 dimensional input for the network, which generates a 1x2 output. After applying a softmax function, the output is the new pseudo soft-label at the window's middle position.
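A possible PyTorch realization of this architecture is sketched below; the number of convolution channels and the exact padding values are assumptions, while the kernel sizes, the ReLU activations, and the $20\times2$ input to $1\times2$ output mapping follow the description above.
\begin{verbatim}
import torch
import torch.nn as nn

class PseudoLabelCNN(nn.Module):
    """Re-estimates the pseudo label at the centre of a 20-window context."""

    def __init__(self, channels=32):
        super().__init__()
        # two 1D convolutions over the [null, hw] channels along the time axis;
        # channel count and padding are assumed values
        self.conv1 = nn.Conv1d(2, channels, kernel_size=10, stride=1, padding=5)
        self.conv2 = nn.Conv1d(channels, channels, kernel_size=5, stride=1, padding=2)
        self.relu = nn.ReLU()
        self.fc = nn.LazyLinear(2)       # classification output (input size inferred)

    def forward(self, x):                # x: (batch, 20, 2) pseudo label windows
        x = x.transpose(1, 2)            # -> (batch, 2, 20)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        logits = self.fc(x.flatten(1))
        return torch.softmax(logits, dim=1)   # refined soft label [null, hw]
\end{verbatim}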
To train the network I used the approach from \secref{sec:synDataset} and created multiple synthetic datasets. On these datasets I predicted the pseudo labels by the base model. Additionally I augmented the labels by adding noise and random label flips. After that I extracted the \textit{correct} intervals. This results in roughly 400 intervals with $\sim1300$ windows which were shuffled before training. As loss function I used the cross-entropy. In \figref{fig:examplePseudoFilterCNN} you can see a plot of the same example intervals as before, but where the pseudo labels are refined by the CNN approach. As you can see this approach can make use of the soft-labels and smooths out the activity boundaries. So imprecise boundaries does have less impact to the training process. But it is also possible, that a local bunch of false predicted values from the base model leads to incorrect pseudo labels.
To train the network I used the approach from \secref{sec:synDataset} and created multiple synthetic datasets. On these datasets I predicted the pseudo labels with the base model. Additionally, I augmented the labels by adding noise and random label flips. After that I extracted the \textit{correct} intervals. This results in roughly 400 intervals with $\sim1300$ windows, which were shuffled before training. As loss function I used cross-entropy. \figref{fig:examplePseudoFilterCNN} shows a plot of the same example intervals as before, but with the pseudo labels refined by the CNN approach. As can be seen, this approach can make use of the soft labels and smooths out the activity boundaries, so imprecise boundaries have less impact on the training process. However, it is also possible that a local group of wrongly predicted values from the base model leads to incorrect pseudo labels.
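The label augmentation can be sketched as follows; the noise level and flip probability are assumed values and not those used in the experiments.
\begin{verbatim}
import numpy as np

def augment_pseudo_labels(pseudo, noise_std=0.1, flip_prob=0.05, rng=None):
    """Add Gaussian noise and random label flips to soft pseudo labels.

    pseudo: array of shape (m, 2) with rows [y_null, y_hw]
    """
    rng = rng or np.random.default_rng()
    noisy = np.clip(pseudo + rng.normal(0.0, noise_std, size=pseudo.shape), 1e-6, 1.0)
    flips = rng.random(len(noisy)) < flip_prob
    noisy[flips] = noisy[flips, ::-1]                 # swap null/hw membership
    return noisy / noisy.sum(axis=1, keepdims=True)   # renormalize to valid soft labels
\end{verbatim}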
\input{figures/approach/example_pseudo_filter_cnn}
......@@ -132,7 +134,7 @@ The architecture of convLSTM1-dAE uses two convLSTM layers for encoding and deco
\input{figures/approach/example_pseudo_filter_lstmdae}
\subsection{Filter configurations}\label{sec:approachFilterConfigurations}
During testing I have created multiple filter configurations which consists of different constellations of the introduced denoising approaches. These configurations can be seen in \tabref{tab:filterConfigurations}. Some of them rely on ground truth data and are just used for evaluation. The configurations \texttt{all, high\_conf, scope, all\_corrected\_null, scope\_corrected\_null, all\_corrected\_null\_hwgt} and \texttt{scope\_corrected\_null\_hwgt} depict base lines to observe which impact different parts could have to the training. So \texttt{all} and \texttt{high\_conf} show simple approaches where no user feedback is considered. Configuration \texttt{scope} depict the difference between including more data which could potentially be wrong and just using data where additional information is available. To show the improvements by simply correcting false predictions, \texttt{all\_corrected\_null, scope\_corrected\_null} is used. This is extended by theoretical evaluations of \texttt{all\_corrected\_null\_hwgt, scope\_corrected\_null\_hwgt} which states an upper bound of a possible perfect filter approach for \textit{hw, manual} intervals. The \texttt{all\_null\_*} configurations rely on the context knowledge, that the hand wash parts should be way less than all other activities. So we can assume, that all labels should be of value \textit{null} and just inside \textit{hw, manual} intervals there are some of value \textit{hw}. This depends on how reliably a user has specified all hand wash actions. Here especially the performance of the introduced denoising approaches is focused. Again \texttt{all\_null\_hwgt} represents the theoretical upper bound if a perfect filter would exits. As a more general approach, the \texttt{all\_cnn\_*} configurations do not make such a hard contextual statements and attempt to combine the cleaning abilities of the CNN network and high confidence in the resulting labels, to augment the training data with likely correct samples.
During testing I created multiple filter configurations which consist of different combinations of the introduced denoising approaches. These configurations and their detailed descriptions can be seen in \tabref{tab:filterConfigurations}. Some of them rely on ground truth data and are only used for evaluation. The configurations \texttt{all, high\_conf, scope, all\_corrected\_null, scope\_corrected\_null, all\_corrected\_null\_hwgt} and \texttt{scope\_corrected\_null\_hwgt} depict baselines to observe the impact different parts could have on the training. \texttt{all} and \texttt{high\_conf} show simple approaches where no user feedback is considered. Configuration \texttt{scope} depicts the difference between including more data which could potentially be wrong and using only data where additional information is available. To show the improvements from simply correcting false predictions, \texttt{all\_corrected\_null} and \texttt{scope\_corrected\_null} are used. This is extended by the theoretical evaluations \texttt{all\_corrected\_null\_hwgt} and \texttt{scope\_corrected\_null\_hwgt}, which state an upper bound for a possible perfect filter approach for \textit{hw, manual} intervals. The \texttt{all\_null\_*} configurations rely on the context knowledge that the hand wash parts should make up far less of the data than all other activities. So we can assume that all labels should be of value \textit{null} and only inside \textit{hw, manual} intervals there are some of value \textit{hw}. This depends on how reliably a user has indicated all hand wash actions. Here especially the performance of the introduced denoising approaches is in focus. Again, \texttt{all\_null\_hwgt} represents the theoretical upper bound if a perfect filter existed. As a more general approach, the \texttt{all\_cnn\_*} configurations do not make such hard contextual statements and attempt to combine the cleaning abilities of the CNN network with high confidence in the resulting labels, to augment the training data with likely correct samples.
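Conceptually, each configuration is an ordered list of refinement steps applied to the pseudo labels of a recording. The following hypothetical helper only illustrates this composition; the authoritative step lists per configuration are those given in \tabref{tab:filterConfigurations}.
\begin{verbatim}
def apply_configuration(pseudo, intervals, steps):
    """Apply an ordered list of refinement steps to one recording's pseudo labels.

    Each step is a callable mapping (pseudo, intervals) -> refined pseudo labels,
    e.g. feedback correction, the naive subset search, or the CNN denoiser.
    """
    for step in steps:
        pseudo = step(pseudo, intervals)
    return pseudo
\end{verbatim}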
\input{figures/approach/table_filter_configurations}
......
......@@ -83,7 +83,7 @@ First we observe how the model performance evolves over the iteration steps. \fi
\input{figures/experiments/supervised_evolution_single}
\subsubsection{Comparison of filter configurations}
\subsubsection{Comparison of filter configurations}\label{sec:expEvolCompFilter}
In this step I compare the evaluation of the personalized model across the different filter configurations. Additionally, I apply different numbers of epochs and separate the regularization methods. The models are trained with 50, 100, and 150 epochs. \figref{fig:evolutionAll} shows their S scores. In (a) freezing the feature layers and in (b) the L2-SP penalty is used for regularization. The personalizations trained with frozen layers all show a similar increasing trend in performance. With more epochs they seem to achieve higher S values. Especially in the first iteration, all personalized models trained with 150 epochs already outperform the general model. With L2-SP regularization the performance varies heavily. For each selection of epochs the personalization leads to different results. It is possible that a better performing model is trained, but no exact statement can be made.
\input{figures/experiments/supervised_evolution_all}
......
\begin{figure}[t]
\begin{centering}
\subfloat[manual hand washing]
{\includegraphics[width=0.28\textwidth]{figures/approach/base_application_screen_manual.png}}
\qquad
\subfloat[detected hand washing]
{\includegraphics[width=0.28\textwidth]{figures/approach/base_application_screen_yes_no.png}}
\qquad
\subfloat[evaluation]
{\includegraphics[width=0.28\textwidth]{figures/approach/base_application_screen_eval.png}}
\caption[Base application screen shots]{\textbf{Base application screen shots.} (a) shows the application in its default state, where the user has the opportunity to trigger a hand wash event manually. (b) shows the notification which appears when the application has detected a hand wash activity; here the user can confirm or decline. (c) shows one of the evaluation queries which the user has to answer for the OCD observation. These are shown if the user triggered a manual hand wash event or confirmed a detected hand washing activity.}
\label{fig:baseApplicationScreen}
\end{centering}
\end{figure}
......@@ -21,12 +21,12 @@
% Change to your first examiner
% The ~ enables non sentence spacing after a period
\newcommand{\firstexaminer}{Prof.~Dr.~Bugs Bunny}
\newcommand{\firstexaminer}{Dr.~Philipp Scholl}
% Change to your second examiner, some undergraduate studies don't have a second examiner
% in this case just comment out the following line
\newcommand{\secondexaminer}{Prof.~Dr.~Wile E. Coyote}
\newcommand{\secondexaminer}{Prof.~Marco Zimmerling}
% Change to your advisers
\newcommand{\advisers}{Terence Hill, Bud Spencer}
\newcommand{\advisers}{Dr.~Philipp Scholl}
% include all packages and define commands in setup.tex
\input{setup}
......