Commit cc90d980 authored by burcharr

automatic writing commit ...

parent 666be086
## Discussion of results
In this section, we discuss the results of our hand washing detection system evaluation. We summarize the conclusions we draw from these results and give an overview of the limitations and possible future improvements of our approach.
### Theoretical evaluation
The results of the theoretical evaluation show that, for each of the defined problems, the neural network based methods can learn to classify the desired activities with high accuracy. However, the problems differ in difficulty, and the resulting F1 scores and S scores are not yet perfect, which means there is still room for improvement.
#### Problem 1
For the problem of classifying hand washing and separating it from all other activities, the raw predictions of the networks without smoothing reached an F1 score of $0.853$ (DeepConvLSTM) and an S score of $0.758$ (DeepConvLSTM-A). DeepConvLSTM and DeepConvLSTM-A surpass all other models that we tested, including the baselines RFC, SVM, majority classifier and random classifier, by large margins. This is in line with related work on other human activity recognition tasks, where DeepConvLSTM, with and without small modifications, also achieved the best results. On this specific problem, the CNN model also deserves mention: its performance was worse, but not far from that of the DeepConvLSTM-based models. Apart from the neural networks' general superiority, one reason for the worse performance of the classical baselines RFC and SVM could be the imbalanced data set. For the baselines, we did not include class weighting or other means of coping with the imbalance, which could explain part of the performance gap to the neural network based methods. Nevertheless, it is reasonable to assume that our methods would still have beaten the baselines by a large margin if we had applied such measures, as the performance difference was substantial.
The application of smoothing improved the performance of the models even further, to an F1 score of $0.892$ (DeepConvLSTM) and an S score of $0.819$ (DeepConvLSTM-A). This performance boost can be explained by the temporal context captured in the data: if many windows in rapid succession are classified as hand washing, it is likely that a small number of wrong Null class predictions appear among them. The smoothing helps to filter out both false positives and false negatives.
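A minimal sketch of such a smoothing filter is given below; the filter length `k` and the decision threshold are illustrative choices, not the exact values of our implementation:

```python
import numpy as np

def smooth_predictions(probs: np.ndarray, k: int = 20, threshold: float = 0.5) -> np.ndarray:
    """Running-mean smoothing of per-window hand washing probabilities.

    probs: 1-D array with one positive-class probability per window.
    Returns binary labels after averaging each window with its neighbors.
    """
    kernel = np.ones(k) / k
    smoothed = np.convolve(probs, kernel, mode="same")  # keeps alignment with the input windows
    return (smoothed >= threshold).astype(int)
```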
Normalization was shown to be ineffective for our approach, worsening the performance of almost all models. This could be due to a difference between the distributions of the training and test sets. The parameters for normalization were estimated on the training set and applied to the test set, which is only accurate if both sets follow the same distribution. This was not the case here, which is probably why the normalized data was harder to learn from and test on than the non-normalized data.
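For clarity, the normalization scheme can be sketched as follows; the point is that the statistics come from the training set only, which is exactly where a train/test distribution shift hurts:

```python
import numpy as np

def fit_normalizer(train_data: np.ndarray):
    """Estimate per-channel statistics on the training set only."""
    mean = train_data.mean(axis=0)
    std = train_data.std(axis=0) + 1e-8  # guard against constant channels
    return mean, std

def normalize(data: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    # the same training-set statistics are applied to the test set,
    # which is inaccurate whenever the two distributions differ
    return (data - mean) / std
```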
For the reasons explained in section \ref{s_score}, we weigh the results of the S score higher than those of the F1 score. Thus, the best network for problem 1 is DeepConvLSTM-A, although only by a slight margin. The overall achieved S score of $0.819$ is based on a specificity of $0.751$ and a sensitivity of $0.90$, which means that $90\,\%$ of windows containing hand washing were correctly classified as hand washing. However, only $75.1\,\%$ of windows that contained no hand washing were classified as Null, which leaves room for improvement, because the model still has a false positive rate of $24.9\,\%$, which is more than desired.
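With the definition of the S score as the harmonic mean of sensitivity and specificity (section \ref{s_score}), the reported value can be checked directly:

$$S = \frac{2 \cdot \mathrm{sens} \cdot \mathrm{spec}}{\mathrm{sens} + \mathrm{spec}} = \frac{2 \cdot 0.90 \cdot 0.751}{0.90 + 0.751} \approx 0.819$$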
The binarized versions of the models trained on problem 3 achieve a notable success in terms of F1 scores similar to those of the models trained for problem 1. However, their performance in terms of the S score is worse, by about $0.052$ for the best model and by more for the others. Therefore, and especially because of the higher importance of the S score, the models trained on problem 3 are not as good at classifying hand washing and separating it from other activities as the models specifically trained for this problem. This lower performance can be explained by the higher difficulty of the 3-class problem learned by the classifiers trained for problem 3; the loss in performance was thus to be expected.
Compared to the results obtained by Mondol et al. with HAWAD @sayeed_mondol_hawad_2020, with F1 scores over $90\,\%$, it may look like our approach provides weaker results. Their detection of out-of-distribution samples sounds like a good idea in theory. However, their results and ours are not entirely comparable, because we did not train or evaluate on the same data. Added to that, from what they report in their paper, they did not split the data by subjects, but rather by data windows, with random sampling. This means that, during training, their model saw data from all subjects, including the subjects whose data they later tested on. Although this is not technically a leak from the training set to the test set, our approach of splitting by subjects can be expected to deliver a better estimate of generalization performance, because our models' (over-)adaptation to specific subjects' styles or patterns of hand washing cannot yield a performance boost on unseen subjects. Nevertheless, the detection of out-of-distribution samples could possibly increase the performance of our models. Still, one has to keep in mind that a sample being out of distribution does not always mean that it cannot be hand washing, especially if we test on unseen subjects who might employ different patterns of motion during hand washing. For these reasons, the comparability of the results seems rather low, with the performance of HAWAD likely being overestimated in comparison to our scenario. According to our findings, a fully connected network cannot reach the same level of performance as the DeepConvLSTM-based models.
#### Problem 2
The problem of classifying compulsive hand washing and distinguishing it from non-compulsive hand washing may seem more difficult than problem 1 from an outsider's perspective: distinguishing different, but closely related, types of hand washing should be harder than telling apart hand washing from all other activities. However, the results of problem 2 seem to prove the opposite, as significantly higher F1 scores and S scores are reached. For the raw predictions, F1 scores of around $0.92$ are reached by the LSTM and DeepConvLSTM(-A), and an S score of $0.869$ is reached by DeepConvLSTM. The classic machine learning methods SVM and RFC also reach good F1 scores near $0.89$, but significantly lower S scores below $0.735$, likely due to their problems with the class imbalance. Both metrics' scores reached without smoothing are higher than the scores reached with smoothing for problem 1, indicating that distinguishing compulsive from non-compulsive hand washing could be learned better than separating hand washing from all other activities. This could stem from the much smaller amount of data used for problem 2, as we only included hand washing data here, taken from our own data sets rather than from the external ones. The heterogeneity of the data set for problem 1 compared to problem 2 probably makes the network training harder in problem 1. Furthermore, the data for problem 1 includes many activities, which must implicitly be mapped to the Null class by the network, while problem 2 only knows two activities. The imbalance of the data used for problem 2 ($65\,\%$ positive samples, $35\,\%$ negative samples) is also smaller than that of problem 1. However, it is unclear whether this had an effect on the performance of the neural network based methods, as we used a weighted loss function to combat this problem.
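As an illustration, such a weighted loss can be set up as follows; this is a sketch with inverse-frequency class weights, and the exact weighting scheme of our implementation may differ:

```python
import torch
import torch.nn as nn

# class distribution of problem 2: 35 % negative, 65 % positive samples
class_frequencies = torch.tensor([0.35, 0.65])

# inverse-frequency weights: the rarer class contributes more to the loss
weights = 1.0 / class_frequencies
weights = weights / weights.sum()

criterion = nn.CrossEntropyLoss(weight=weights)
# during training: loss = criterion(logits, targets)
```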
As for problem 1, normalizing the data did not improve performance, but rather decreased it. The reasons are assumed to be the same as for problem 1.
The results of problem 2 with the application of smoothing look even more promising. A very high F1 score of $0.966$ and an S score of $0.911$ are reached by DeepConvLSTM-A. The sensitivity reaches a value of $0.997$, while the specificity is $0.839$. Added to that, DeepConvLSTM, LSTM and LSTM-A all reach very similar performance levels. DeepConvLSTM-A has the highest performance values, but the difference is so small as to be insignificant. The performance is even better than without smoothing and hints that the detection of compulsive hand washing in a hand washing scenario is feasible. However, at this point we must stress again that the compulsive hand washing data used in this work is only simulated. Although the simulation was supervised with the help of expert psychologists, there is no guarantee that real compulsive hand washing is distinguishable with the same level of performance. Arguably, even if the real difference between hand washing and compulsive hand washing were smaller than in our data, the performance could still be high enough for a satisfactory separation of the two. It is likely that, once enough data from OCD patients is available, the models mentioned could be trained to detect non-simulated compulsive washing with high accuracy.
As there is no published previous work in the area of automatically detecting compulsive hand washing, the results cannot be compared to previously achieved results. The strong performance levels indicate a high probability of the approach being applicable in a real-world scenario. Unfortunately, as our work's real-world evaluation was limited to the best model for problem 1, we cannot report real-world results to test this hypothesis.
#### Problem 3
The problem of classifying hand washing and compulsive hand washing separately while distinguishing both from other activities at the same time is arguably harder than the other two problems. Problem 3 can be seen as a combination of problem 1 and problem 2, namely classifying whether an activity is hand washing (problem 1) and, if so, whether said washing activity is compulsive (problem 2). Being a 3-class classification problem, problem 3 is thus more difficult and has more room for errors than the other two problems. As a consequence, a lower level of performance must be expected.
Out of the models trained directly on problem 3, DeepConvLSTM-A performed best, with a multiclass F1 score of $0.692$, a multiclass S score of $0.769$ and a mean diagonal value of the confusion matrix of $0.712$. DeepConvLSTM achieved a slightly lower, but nearly as good performance. For this problem, the baseline classic machine learning methods performed much worse, with their multiclass F1 and S scores, as well as their mean diagonal values of the confusion matrix, lying in the range of around $0.5$.
In addition to the models trained on problem 3, we also report the performance of a chained model consisting of the two best performing models for problem 1 and problem 2. Since problem 3 is the combination of problem 1 and problem 2, the chained model can be used to make the same predictions. The chained model we use consists of DeepConvLSTM-A from problem 1 and DeepConvLSTM from problem 2, as those were the best performing models for these two problems in terms of non-smoothed predictions and the S score. The chained model reached an even higher performance, with a multiclass F1 score of $0.714$, a multiclass S score of $0.783$ and a mean diagonal value of the confusion matrix of $0.718$. This result is valuable, because it shows that the classifiers trained for problem 1 and problem 2 can outperform a classifier specifically trained for problem 3. It indicates that the sub-problems of problem 3 are more easily solved independently than problem 3 is solved directly. The downside of using two networks is that they take twice the time to train, twice the time and energy to run and twice the memory or storage, and are thus less efficient. Especially on a smart watch or any embedded mobile device the models could be deployed on, this could be a big disadvantage compared to the single model trained for problem 3. The performance difference is significant, but not large: the difference of $0.03$ in the multiclass S score is so small that it could well be indistinguishable for real-world users.
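Conceptually, the chained model combines the two binary classifiers as sketched below; `model_p1` and `model_p2` stand for the trained problem 1 and problem 2 networks, and the class index conventions are assumptions made for this illustration:

```python
import torch

NULL, HAND_WASH, COMPULSIVE = 0, 1, 2

def chained_predict(window: torch.Tensor, model_p1, model_p2) -> int:
    """Problem 3 prediction chained from the two problem-specific models."""
    # stage 1: hand washing vs. all other activities (problem 1);
    # class index 0 = Null is an assumption for this sketch
    if model_p1(window).argmax(dim=-1).item() == 0:
        return NULL
    # stage 2: compulsive vs. non-compulsive washing (problem 2);
    # class index 1 = compulsive is likewise assumed
    if model_p2(window).argmax(dim=-1).item() == 1:
        return COMPULSIVE
    return HAND_WASH
```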
We did not apply smoothing for problem 3, but it could be done in theory, using a slightly different approach, and it may also improve the performance of the system. One could use a similar sliding-filter method, but instead of taking the running mean, assign each sample the class that is most frequent among a certain number of neighboring predictions.
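A minimal sketch of such a majority-vote filter; the neighborhood size `k` is an illustrative choice:

```python
import numpy as np

def majority_smooth(labels: np.ndarray, k: int = 21) -> np.ndarray:
    """Sliding majority vote over multiclass window predictions (k odd)."""
    half = k // 2
    padded = np.pad(labels, half, mode="edge")  # repeat the border labels
    out = np.empty_like(labels)
    for i in range(len(labels)):
        # most frequent class among the k neighboring predictions
        out[i] = np.bincount(padded[i:i + k]).argmax()
    return out
```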
Because of these preliminary results, we did not apply normalization to the chained model, but we tested all other models for problem 3 with it and obtained results matching those for the other two problems: normalization did not help. We assume the same reasons as mentioned in the discussion of problem 1.
To conclude the results of problem 3, the overall performance of our models on this more difficult problem is worse than the performances for problem 1 and problem 2. However, if we are willing to trade efficiency for performance in the metrics used, a chained model consisting of two models trained on the sub-problems 1 and 2 can be applied to obtain a performance boost.
### Practical applicability
The data from the real-world evaluation with our test subjects shows that not all real-world hand washing procedures are detected by our smart watch system. Overall, the system's sensitivity was only $28.33\,\%$ in the evaluation of a "normal day", which is much lower than the theoretical results. However, this was to be expected to some degree, since real hand washing takes many forms and patterns that are unlikely to all be captured during the explicit recording of training data. Added to that, the hand washing detection depended, at least for some subjects, on the side of the body on which the watch was worn. The performance was significantly worse if the watch was worn on the right wrist. This is likely due to the hand washing data used for training being collected almost exclusively with smart watches worn on the left wrist. If the data from subjects wearing the watch on the right wrist is left out, the overall detection sensitivity rises to $50\,\%$.
For one additional subject, the smart watch application did not work properly, i.e. it did not start to run in the background as desired, which is why their results could not be included in the reported results. However, it is possible that other users' smart watch applications were also inactive for some of the time, possibly missing some hand washing procedures.
Because of the smoothing applied to the data, at least several consecutive windows must be classified into the positive class before a prediction is made, which means that a real hand washing procedure needs to last around $10\,s$ or longer. In practice, washing one's hands can take less time, in which case the system will not detect it properly. It is even enough if, for some period in the middle of a washing procedure, the washing intensity is low enough for the model to misclassify it as noise.
It is not entirely clear why the theoretical results could not be fully reached in the real life scenario. It could be due to the assumptions made during the recording of the data sets, i.e. the way the hands were washed during the recordings could be too different from unbiased real-world washing. In order to improve the real-world performance, further research has to be conducted. All in all, the system was able to correctly detect most hand washing procedures and is therefore somewhat effective at this task.
We also expected that a higher intensity or a longer duration of the hand washing would have a positive influence on the detection probability of the model on the smart watch. This seems logical for the duration because of the smoothing, and for the intensity because of the higher sensor values: it can be assumed that the system reaches higher certainty for high intensity washing than for low intensity washing, as it is more separable from less intense activities. However, the results showed a significantly positive correlation only between intensity and detection rate ($0.267$), whereas detection rate and hand washing duration seemed to be mostly uncorrelated ($-0.039$). This may again be due to the relatively small sample size. Especially for the longer washing tasks of 30s and 35s, there were only 2 examples, neither of which was detected. This may have had a big influence on the absence of a positive correlation in the evaluation results.
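The reported values are plain correlation coefficients between the per-repetition detection outcome and the respective washing parameter, i.e. of the form sketched below; the arrays hold illustrative placeholder values, not our measurements:

```python
import numpy as np

# one entry per washing repetition (illustrative placeholder values)
intensity = np.array([1, 3, 2, 3, 1, 2])  # rated washing intensity
detected = np.array([0, 1, 0, 1, 0, 1])   # 1 = detected by the watch

r = np.corrcoef(intensity, detected)[0, 1]  # Pearson correlation coefficient
print(r)
```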
Added to that, the system detected an average of 4 false positives per subject per day. These false positives could lead to annoyance and ultimately to users losing trust in the detection capabilities of the system. However, the number found in the everyday task also varied a lot from subject to subject. Mainly washing-related activities led to false positives, which was to be expected, because movements similar to those of hand washing are executed. Other activities also led to false positives, confirming that the high, but not very high, specificity of the theoretical results does not lead to the total avoidance of false positives.
The test of scenario 2, the task of washing intensively for at least 30 seconds, yielded a higher accuracy. Per subject, the washing was detected on average in $76\,\%$ of washing repetitions. Compared to the sensitivity of $90\,\%$ reached for problem 1 with smoothing, this is only $14$ percentage points lower. The discrepancy here is much smaller than in the everyday scenario. This could be because the training data for hand washing was also collected in a more controlled environment, producing more similar patterns. The intensive washing in the experiment likely also resembles the intensive washing recorded as simulated compulsive hand washing. The results of the evaluation for scenario 2 are thus better than the results for scenario 1.
In total, the practical evaluation showed both weaknesses and strengths of the system. As the sample size is small and system instabilities occurred, the results have to be interpreted carefully. The evaluation is valid especially for the false positives and the activities provoking them. However, the low sensitivity found in the everyday task does not match the much higher sensitivity found in the intensive hand washing task, and the differences between subjects were huge for scenario 1. Part of the reason for this is the difference in performance between the left and right wrists.
## Future work
The general performance of our models on problem 2, distinguishing compulsive hand washing from non-compulsive hand washing, was high. The downside is that this model is only applicable if we know when the hand washing takes place. However, our results could be employed together with other tools that tell us when the user is currently washing their hands. Examples of this are in development in our group, one of them being a soap dispenser with an integrated proximity sensor. Added to that, Bluetooth beacons stationed near sinks can be used to let the smart watch know that the user is near a specific sink. Conductivity sensors on the user's skin could be employed to detect a change of conductivity caused by contact with tap water. One or more of these methods combined with our model trained for problem 2 could possibly be used to achieve a higher performance for the task of compulsive hand washing detection in the future.
The detection of hand washing could be incorporated into many devices, mainly wrist worn ones like smart watches. In order to further improve the detection capabilities and accuracy, one would need to invest even more time into carefully designing and training better models. This work's architecture search could be expanded, and more parameter combinations could be tried, for example different types of layers that have not been included in the architectures yet. Instead of normalizing data on the data set level, batch normalization could be used to try to make the networks faster and more stable.
Different attention mechanisms could be tried out on the hand washing data.
On top of that, all other hyperparameters could be optimized further. Instead of manual hyperparameter optimization (HPO), more sophisticated HPO methods could be employed, e.g. Bayesian optimization, as sketched below. This could lead to better choices for the batch size, learning rate and other parameters. However, such a search may take a lot of time to run, as it is computationally expensive.
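As an example, Bayesian HPO over the learning rate and batch size could be set up with a library such as Optuna; `train_and_validate` is a hypothetical helper that trains a model with the sampled parameters and returns its validation S score:

```python
import optuna

def objective(trial: optuna.Trial) -> float:
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    # train_and_validate is a hypothetical helper: it trains the model with
    # the sampled parameters and returns the validation S score
    return train_and_validate(lr=lr, batch_size=batch_size)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params)
```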
The current state of the system, especially for the classification of hand washing versus compulsive hand washing, looks promising for future work in this area. The collection of real obsessive-compulsive hand washing data would likely make it possible to train models capable of reliably classifying compulsive hand washing. Such models could then be tested and evaluated with real-world subjects. If they perform well enough, they could aid psychologists and their patients in the treatment of compulsive hand washing. As explained in the introduction, exposure and response prevention (ERP) is a viable treatment method, and interventions from a smart watch could possibly be used for response prevention. The exact design of the interventions and their actual usability forms another exciting problem field and is yet to be researched.
The hand washing detection should also work well on both wrists. Multiple solutions for the differences between the two sides could be tried. One could train two separate models, one for each wrist. The downside of this is that the system would also need to figure out on which wrist it is worn, either automatically or by user input, which adds uncertainty. Another idea would be to train a single model on balanced data from both wrists, so that it can possibly learn implicitly which wrist the watch is worn on. No matter how this problem is solved, it seems the watch's position on the body must be accounted for in some way, possibly requiring more data, or explicit labels for the sensor position in the existing data.
More data could also be incorporated for the negative class, because more distinct activities should be included in the data. While the standard movement activities of walking, jogging, sitting, walking up and down stairs and some fitness activities were already included in this work, more specialized activities have not yet been included, possibly contributing to the increased false positive rate in the real-world scenario. Although we already include day-long recordings of everyday activity data, the data set would quickly become huge if we included more of these, because they are very long. The everyday recordings are also not very efficient, as much of the data they contain is idleness, which is not very helpful for training a model. As a result, it is likely necessary to manually record and include everyday activities, like washing plates or pans, cleaning, brushing teeth and more. It would be even more desirable to have access to a whole database of human activities recorded with body worn sensors, to be used as negative examples in the training of a hand washing detection model.
Another way of generating more data would be data augmentation. This works by copying existing data samples and adding small amounts of random noise, forming new samples with the same labels. Data augmentation is an inexpensive way of generating more data to train our neural networks. In our specific case, where the recording of new hand washing data of any kind takes a lot of effort, hand washing data could be generated this way in order to have more of it available for training. Especially data of compulsive hand washing could be augmented in such a way, as it is harder to obtain than recordings of ordinary hand washing.
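A minimal sketch of such a jitter-based augmentation; the noise scale `sigma` and the number of copies are illustrative assumptions:

```python
import numpy as np

def jitter_augment(samples: np.ndarray, labels: np.ndarray,
                   copies: int = 2, sigma: float = 0.05):
    """Noisy copies of IMU windows; labels are carried over unchanged.

    samples: shape (n_windows, window_length, n_channels).
    """
    augmented, new_labels = [samples], [labels]
    for _ in range(copies):
        noise = np.random.normal(0.0, sigma, size=samples.shape)
        augmented.append(samples + noise)
        new_labels.append(labels)
    return np.concatenate(augmented), np.concatenate(new_labels)
```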
To avoid false positives, one could also try to detect out-of-distribution movements, similar to the HAWAD approach discussed above. This method must be applied carefully, as we cannot be certain that all out-of-distribution samples are not hand washing; its applicability needs to be tested thoroughly.
# Conclusion
In this work, we described the development, training and evaluation of a powerful and accurate system for detecting compulsive and non-compulsive hand washing. The relevance of such a system lies in its applications in the field of hygiene compliance enforcement (general hand washing), as well as in possibly supporting the treatment of obsessive-compulsive disorder with compulsive hand washing.
We theoretically evaluated different neural network designs on three related problems of hand washing detection: the separation of hand washing from other activities, the separation of hand washing from compulsive hand washing, and the separation of hand washing from compulsive hand washing and from other activities at the same time. For this task, we used hand washing data, data of simulated compulsive hand washing, and data of other activities collected from publicly available data sets. After training and evaluation, we selected the best performing system based on several metrics, including the F1 score and the harmonic mean of sensitivity and specificity, which we call the S score. The dominating models, DeepConvLSTM and DeepConvLSTM-A, were both based on a deep convolutional neural network joined with an LSTM layer. For DeepConvLSTM-A, which performed slightly better than DeepConvLSTM, we added an attention mechanism, in order to allow the model to flexibly focus on the more relevant sections of its input. The addition of the attention mechanism led to a small increase in performance.
The designed models were able to beat baselines such as a random forest classifier and a support vector machine, as well as chance level baselines, by a large margin.
In a practical evaluation with 5 subjects, we tested DeepConvLSTM-A on the hand washing detection task in a real-world, everyday environment, as well as in a fixed schedule hand washing test. The system ran on a smart watch, which was used to monitor the user's wrist movements in real-time and tried to correctly detect hand washing. The sensitivity of this test was lower than expected ($28.33\,\%$; $50\,\%$ if the correct wrist was used). Furthermore, around 4 false positives per day appeared for different activities, many of which were washing related. They included, but were not limited to, doing the dishes, brushing one's teeth and scratching oneself. High numbers of false positives could be ruled out in the future by adding more everyday activities to the training data.
In the second test of the practical evaluation, subjects performed intensive and long hand washing repetitions, which were closer to our lab recorded washing data (including the simulated compulsive data) and thus easier to detect. The system's performance here was much closer to the sensitivity results of the theoretical evaluation ($76\,\%$ vs. $90\,\%$, and $82.5\,\%$ if the correct wrist was used).
# Introduction
In order to monitor the effectiveness and frequency of hand washing, we could use a sensor-based computer system to detect the activity of hand washing and its duration. More advanced systems could also be used to assess the quality of the hand washing. Such systems could then be used to reduce the risk of contamination or infection by improving the hygiene of their users.
### Obsessive-compulsive disorder
While it is usually helpful and a basic part of hygiene, hand washing can also be overdone, i.e. done too frequently or too thoroughly. One example of a group for which overly excessive hand washing is a problem is the small percentage of people suffering from Obsessive-Compulsive Disorder (OCD). OCD affects about $1-3\,\%$ of humans during their life @valleni-basile_frequency_1994, @fawcett_women_2020. OCD appears in the form of obsessions that lead to compulsive behavior. There are multiple subgroups of obsessions and compulsions, including contamination concerns, symmetry and precision concerns, saving concerns and more @stein_obsessive-compulsive_2002. These concerns lead to corresponding compulsive behavior: symmetry and precision concerns lead to arranging and ordering, saving concerns lead to hoarding, and contamination concerns can lead to excessive washing, bathing and showering, including compulsive hand washing. This work focuses on detecting hand washing and also tries to tell apart ordinary hand washing from the compulsive hand washing of OCD patients.
One method of treatment for clinical cases of OCD is exposure and response prevention (ERP) therapy @meyer_modification_1966 @whittal_treatment_2005. With this method, patients suffering from OCD are exposed to situations in which their obsessions are stimulated, and are helped to prevent compulsive reactions to the stimulation. The patients can then "get used" to the situation in a sense, and the reaction to the stimulation weakens over time. This improves their quality of life, as the severity of their OCD declines.
A successful, i.e. reliable and accurate, system for compulsive hand washing detection could be used to intervene whenever compulsive hand washing is detected. It could therefore help psychologists and their patients in the treatment of the symptoms. It could help the user stop the compulsive behavior by issuing a warning, such as a vibration of the device or a sound played upon the detection of compulsive behavior. However, this hypothesis of usefulness is yet to be tested, as no such system exists as of now. Therefore, we want to develop a system that can not only detect hand washing with low latency and in real-time, but also discriminate between usual hand washing and obsessive-compulsive hand washing at the same time. The system could then, as described, be used in ERP therapy sessions, but also in everyday life, to prevent compulsive hand washing.
The separation of compulsive hand washing from ordinary hand washing could be an even harder problem than hand washing detection itself. It is unclear whether it is possible to predict the type of hand washing with high probability, as there is no previous work in this area. It is reasonable to assume that there are strong similarities between compulsive and non-compulsive hand washing, as well as subtle differences, e.g. in the intensity and duration of the washing.
### Wrist worn sensors
Different types of sensors can be used to detect activities such as hand washing. It is possible to detect hand washing from RGB camera data to some extent. However, for this to work, we would need to place a camera in every place and room where a subject might want to wash their hands. This is unfeasible for most applications of hand washing detection and could be very expensive. Furthermore, it might be problematic to place cameras inside wash- or bathrooms for privacy reasons. Thus, a better alternative could be body worn, camera-less devices.
Inertial measurement units (IMUs) can measure different types of time series movement data, e.g. the acceleration or angular velocity of the device they are embedded in. IMUs are built into most modern smart phones and smart watches, which makes them easily available. For hand washing detection, especially the movement of the hands and wrists can contain information that helps us classify the activity. Therefore, we can use a smart watch and its embedded IMU to try to predict whether a user is washing their hands or not, and if so, whether they are washing them in an obsessive-compulsive way. Another advantage of using a smart watch is that they usually have built-in vibration motors or even speakers, which could be used to intervene whenever compulsive hand washing is detected, as described above. Therefore, wrist worn sensors, especially those embedded in smart watch systems, are used in this work. The wrist worn devices can also be used to execute machine learning models in real-time, using publicly available libraries, e.g. on smart watches running Wear OS.
## Goals
In this work, we want to develop several neural network based machine learning methods for the real-time detection of hand washing and compulsive hand washing on inertial sensor data of wrist-worn devices. We also want to test the methods and report meaningful statistics for their performance. Further, we want to test parts of the developed methods in a real-world scenario. We then want to draw conclusions on the applicability of the developed systems in the real world.
### Detection of hand washing in real-time utilizing inertial measurement sensors
We want to show that neural network based classification methods can be applied to the recognition of hand washing. We want to base our method on sensor data from inertial measurement sensors in smart watches or other wrist-worn IMU-equipped devices. We want to detect hand washing in real-time and directly on the mobile, i.e. on a wrist-worn device such as a smart watch. Doing so, we would be able to give instant real-time feedback to the user of the device.
### Separation of hand washing and compulsive hand washing
On top of the detection of hand washing, the detection of obsessive-compulsive hand washing is part of our goals. We want to be able to separate compulsive from non-compulsive hand washing based on inertial motion data. Especially for the scenario of possible interventions used in the treatment of OCD, this separation is crucial, as OCD patients also wash their hands in non-compulsive ways and we do not want to intervene for these ordinary hand washing procedures.
### Real-world evaluation
We want to evaluate the most promising of the developed models in a real-world evaluation, in order to obtain a realistic estimate of its applicability in the task of hand washing detection. We want to report results of an evaluation with multiple subjects to obtain a meaningful performance estimation. From this estimation we want to draw conclusions on the applicability of the developed system in real-world therapy scenarios. Added to that, we want to derive future improvements that could be applied to the system.
declaration: Hiermit erkläre ich, dass ich diese Arbeit selbstständig verfasst habe, keine anderen als die angegebenen Quellen/Hilfsmittel verwendet habe und alle Stellen, die wörtlich oder sinngemäß aus veröffentlichten Schriften entnommen wurden, als solche kenntlich gemacht habe. Darüber hinaus erkläre ich, dass diese Arbeit nicht, auch nicht auszugsweise, bereits für eine andere Prüfung angefertigt wurde.
#abstract
abstract-de: Die automatische Erkennung von Händewaschen und zwanghaftem Händewaschen hat mehrere Anwendungsbereiche in Arbeitsumgebungen und im medizinischen Bereich. Die Erkennung kann zur Überprüfung der Einhaltung von Hygieneregeln eingesetzt werden, da das Händewaschen eine der wichtigsten Komponenten der persönlichen Hygiene ist. Allerdings kann das Waschen auch übertrieben werden, was bedeutet, dass es für die Haut und die allgemeine Gesundheit schädlich sein kann. Manche Patienten mit Zwangsstörungen waschen sich zwanghaft zu häufig und intensiv die Hände auf diese schädliche Weise. Die automatische Erkennung von zwanghaftem Händewaschen kann bei der Behandlung dieser Patienten helfen. Ziel dieser Arbeit ist es, auf neuronalen Netzen basierende Methoden zu entwickeln, die in der Lage sind, Händewaschen und zwanghaftes Händewaschen in Echtzeit auf einem am Handgelenk getragenen Gerät zu erkennen, wobei die Daten der Bewegungssensoren des Geräts verwendet werden. Die entwickelte Methode erreicht eine hohe Genauigkeit für beide Aufgaben. Sie erreicht einen F1 score von 89,2 % für die Erkennung von Händewaschen bzw. 96,6 % für die Erkennung von zwanghaftem Händewaschen. Teile der Arbeit wurden mit Probanden in einem realen Experiment evaluiert, um die starke theoretische Leistung zu bestätigen.
abstract-en: The automatic detection of hand washing and compulsive hand washing has multiple areas of application in work and medical environments. The detection can be used in compliance and hygiene scenarios, as hand washing is one of the main components of personal hygiene. However, the washing can also be overdone, which means it can be harmful to the skin and general health. Patients with obsessive-compulsive disorder sometimes compulsively wash their hands in such a harmful way. The automatic detection of compulsive hand washing could possibly be applied to help with their treatment. This thesis aims to develop neural network based methods which are able to detect hand washing as well as compulsive hand washing in real time on a wrist-worn device, using inertial motion sensor data of the device. We achieve high accuracy for both tasks. We reach an F1 score of 89.2 % for hand washing detection and 96.6 % for compulsive hand washing detection. We evaluate parts of the work on subjects in a real-world experiment, in order to confirm the strong theoretical performance achieved.
---
Then we explain meaningful methods of evaluating the developed models, both on unseen pre-recorded data and with real-world subjects.
## Data set
In order to be able to train any machine learning algorithm, we need enough data to correctly train the chosen model. In our case of wrist motion data, we used acceleration and gyroscope time series data from multiple sources, which will be explained below. The inertial data of each sensor is given as $\mathbf{s}_i \in \mathbb{R}^{d_i \times t}$, where $d_i$ is the dimensionality of the sensor (e.g. $d_{accelerometer} = 3$) and $t$ is the number of samples in a time series. We use accelerometer and gyroscope data, which both have 3 dimensions. We would have liked to use more of the available sensors, like the magnetometer found in many modern IMUs, but most external data sets only include accelerometer and gyroscope data. We combine the two sensors we use into one data series in $\mathbb{R}^{6 \times t}$. An example of the sensor data used in our experiments is shown in fig. \ref{fig:sensor_data}.
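As a minimal sketch of this representation (array names and shapes are illustrative, assuming both sensors have already been aligned to a common time grid):

```python
import numpy as np

# Illustrative arrays for one recording; both sensors share a time grid.
acc = np.random.randn(3, 1000)   # accelerometer: 3 axes x t samples
gyro = np.random.randn(3, 1000)  # gyroscope: 3 axes x t samples

# Stack both sensors into one combined series in R^{6 x t}.
data = np.concatenate([acc, gyro], axis=0)
assert data.shape == (6, 1000)
```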
\begin{figure}[H]
\centering
\subfloat[Hand washing sensor data of one window (150 samples, 3s)]{\includegraphics[width=0.48\textwidth]{img/sample_data.pdf}}
\hfill
\subfloat[Non hand washing sensor data of one window (150 samples, 3s)]{\includegraphics[width=0.48\textwidth]{img/sample_data_no_hw.pdf}}
\caption{Example sensor data for hand washing and non hand washing activities}
\label{fig:sensor_data}
\end{figure}
We also need to pay attention to the sampling rate of the given data, i.e. how many data points per second were recorded. The sampling rate should not differ between the distinct data streams contained in the final data set. Testing a model trained on one timescale with data recorded on a different timescale could lead to a significant decrease in performance. In order to successfully run the machine learning training algorithms, the data must therefore have a jointly fixed sampling rate.
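A simple way to bring a recording onto the joint sampling rate is linear interpolation onto a fixed time grid. The following sketch assumes a $(d \times t)$ array and is only one possible resampling scheme; the actual pipeline may differ:

```python
import numpy as np

def resample_to_rate(signal, src_rate, dst_rate=50.0):
    """Linearly interpolate a (d x t) sensor series onto a fixed target rate.

    This is a simple sketch; other resampling schemes are possible.
    """
    t_src = np.arange(signal.shape[1]) / src_rate
    t_dst = np.arange(0.0, t_src[-1], 1.0 / dst_rate)
    return np.stack([np.interp(t_dst, t_src, axis) for axis in signal])

# e.g. bring a 100 Hz external recording down to the joint 50 Hz rate
resampled = resample_to_rate(np.random.randn(6, 2000), src_rate=100.0)
```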
### Data set requirements for (compulsive) hand washing detection
Our task of separating hand washing from non-hand washing activities is difficult and requires pre-recorded training data. We split the required data into the following activity classes:
1. hand washing
2. compulsive hand washing
3. other activities
In order to correctly detect hand washing in real time in a real-world scenario, the model must have access to real hand washing sensor data during training time. Thus, said data was recorded, labeled and included in the data set. In order to add a "negative" class to the training set, we used data from everyday scenarios ("long term", i.e. all-day recordings), as well as labeled data from previously existing studies concerning gesture or activity recognition. Activities or gestures which are unknown to the system are harder to classify and are more likely to be wrongly detected as hand washing. It is therefore desirable that as many non-hand washing activities as possible are included in the training set, in order to better separate hand washing from all other activities. We wish to avoid false positive hand washing detections, which could annoy the user of the system and possibly lead to decreased trust in its reliability, lowering the user's response to future detections.
In order to also separate non-compulsive hand washing from compulsive hand washing, data of compulsive hand washing must be included. To record this data, real patients can be asked to wear a sensor-equipped recording device during their daily life, but especially during hand washing.
### Data used in our data set
We used hand washing data and "compulsive" hand washing data recorded at the University of Basel and the University of Freiburg as our "positive" class data. This data was recorded on several occasions and using different paradigms. We mainly used data that was recorded at $50\,$Hz, using a smart watch application. Data was recorded in 2019 and in 2020. The data from 2019 includes hand washing data and, added to that, simulated "compulsive" hand washing. For the simulated compulsive hand washing, subjects were asked to "dirty" their hands with different substances, like finger paint or Nivea \textregistered\ creme, to serve as a motivation for intensive hand washing. Afterwards, they had to follow certain scripts of intensive hand washing steps. Each script contained several washing steps, like interlacing the fingers, washing the fingers individually, washing the palms and more.
Some of the gestures used are shown in @fig:gestures.
![Examples of gestures used for the simulation of compulsive hand washing, by Phillip Scholl](img/gestures.jpg){width=70% #fig:gestures}
For this work, we used the simulated compulsive hand washing data as compulsive hand washing data, as we did not have access to recordings of actual compulsive hand washing. Thus, when we write about the "compulsive" hand washing data we used in this thesis, the simulated compulsive hand washing is meant.
\label{tbl:datasets}
\end{table}
The external data sets used are:
- WISDM @kwapisz_activity_2011
- REALDISP @banos_benchmark_2012
- PAMAP2 @reiss_introducing_2012
The external data sets were collected and converted by Daniel Homm, and analyzed and resampled to $50\,$Hz by us. Their contents can be seen in table \ref{tbl:datasets}. They mainly contain activities which involve a lot of movement, which we expect to be helpful in avoiding false positives, as explained above.
### Specifications of the resulting data set used
![Sample distribution for the 3 problems by class. The number in round brackets is the number of windows in the data set used for each problem.](img/dataset_dist.pdf){width=98% #fig:sample_dist}
The distribution of samples over the classes can be seen in @fig:sample_dist.
The data used for training and testing the models differs between the problems due to the tasks' requirements. Namely, the data for problem 2 only contains hand washing data and compulsive hand washing data; the Null class data is not contained in the training and testing data for this problem. However, we still made sure that even across the different problems, each subject recording was only ever assigned to the same side of the split into training set and test set. This means that we can execute the different classifiers trained for one of the problems on the test sets of the other problems without the possibility of accidentally testing on data previously seen by the classifier. Testing on the training set or parts thereof would invalidate the results, thus this property of our splits was desirable.
## Baselines
Baselines can be used to show that our approach outperforms classic and simple approaches to solving the problem.
For each of the problems, we train the baselines with the same windows. For SVM and RFC, we compute the following summary features over each window:
- mean
- median
- minimum
- maximum
- standard deviation
- mean absolute value
The implementations of SVM and RFC in scikit-learn @pedregosa_scikit-learn_nodate are used, with their default parameters.
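The baseline training can be sketched as follows; the feature set matches the list above, computed per window and sensor axis, while the array names and shapes are illustrative:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def window_features(windows):
    """Summary statistics per window and sensor axis.

    windows: array of shape (n_windows, 6, 150). Returns (n_windows, 36):
    mean, median, minimum, maximum, standard deviation and mean absolute
    value, each computed along the time axis.
    """
    feats = [
        windows.mean(axis=2),
        np.median(windows, axis=2),
        windows.min(axis=2),
        windows.max(axis=2),
        windows.std(axis=2),
        np.abs(windows).mean(axis=2),
    ]
    return np.concatenate(feats, axis=1)

# Hypothetical training windows (n, 6, 150) with binary labels (n,).
X_train = np.random.randn(256, 6, 150)
y_train = np.random.randint(0, 2, size=256)

svm = SVC().fit(window_features(X_train), y_train)                   # defaults
rfc = RandomForestClassifier().fit(window_features(X_train), y_train)
```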
### Preprocessing and train-test-split
The data sets were normalized separately, so that each is mean-free and has a standard deviation of $1$. We also tried training all the models without normalization and realized that the performance on the validation set was better without normalization, which is why we included both a normalized and a non-normalized version of the data in our experiments, in order to compare the performance on the test set.
The sensor values were partitioned into windows using a sliding window approach. After preliminary testing, a window length of $3\,s$ ($150$ samples) was fixed. We used an overlap of $50\,\%$ ($75$ samples).
We used a train-test-split of $85\,\%$ to $15\,\%$, but split the data between distinct subjects, in order to enforce that training and testing are executed on different subjects. This makes sure that the performance on the test set gives a good estimate of the generalization performance. As every person washes their hands in a slightly different way, this generalization is needed in the real world in order to also detect unseen but similar patterns of hand washing or compulsive hand washing. The sliding windows were only calculated after the train-test-split, to avoid leakage from the test set into the training set.
For the training, a validation set is split off from the training data, its size being $15\,\%$ of the training data. The validation set is part of the training set and can be used to evaluate a model's performance during development and to monitor the success of the training process.
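A minimal sketch of the subject-wise split and the subsequent windowing follows; the data structures are assumptions and labels are omitted for brevity:

```python
import numpy as np

def sliding_windows(series, length=150, overlap=75):
    """Cut a (6 x t) recording into windows of `length` samples with overlap."""
    step = length - overlap
    starts = range(0, series.shape[1] - length + 1, step)
    return np.stack([series[:, s:s + length] for s in starts])

def subject_split(recordings, test_fraction=0.15, seed=0):
    """Assign whole subjects to train or test, then window each side.

    recordings: dict mapping subject id -> (6 x t) array. Windowing only
    after the split avoids leakage between training and test set.
    """
    rng = np.random.default_rng(seed)
    subjects = rng.permutation(sorted(recordings))
    n_test = max(1, int(round(test_fraction * len(subjects))))
    test_ids, train_ids = subjects[:n_test], subjects[n_test:]
    train = np.concatenate([sliding_windows(recordings[s]) for s in train_ids])
    test = np.concatenate([sliding_windows(recordings[s]) for s in test_ids])
    return train, test
```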
### Architectures
The architecture of a neural network determines its capacity, as well as how well it can be trained and perform at test time. We tried out multiple promising architectures, which are listed below. We implemented all these neural networks in Python using PyTorch @paszke_pytorch_2019. The architectures are explained in this section. A graphical overview of each architecture is shown in fig. \ref{fig:network_architectures}.
\begin{figure}[hp]
\centering
% architecture diagrams elided
\caption{Graphical representations of the network architectures}
\label{fig:network_architectures}
\end{figure}
#### Fully connected network (FC)
As a neural network baseline, we used a fully connected (FC) network with $4$ layers containing $64$ hidden units each. This network is not likely to deliver state-of-the-art performance, but we still expect it to perform better than the "classic" machine learning baselines used. It is also similar to the network used in HAWAD @sayeed_mondol_hawad_2020. The network consists of around 70,000 parameters that need to be learned.
#### Convolutional neural network (CNN)
The convolutional neural network (CNN) consists of multiple convolutional layers. By applying the convolutions along the time axis of the data, the network becomes more invariant to where in the input a certain pattern appears (i.e. at the beginning or the end of a window). In general, convolutional neural networks use fewer parameters than fully connected networks of similar capacity and are thus easier to train. In our case, the CNN model consists of four 1d-convolutional layers with $64$ filters each, which convolve along the time axis. The kernel size is $9$, the stride is $1$. After the four convolutional layers, we apply one linear layer with an output size equal to the number of classes in our problem. This results in a total of around 133,000 parameters, more than the FC network, implying a higher capacity.
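A sketch of this architecture in PyTorch is shown below. The handling of the convolutional output before the linear layer (here: global average pooling over time) is an assumption, so the parameter count of the sketch does not exactly match the one stated above:

```python
import torch
import torch.nn as nn

class ConvNet(nn.Module):
    """Sketch of the CNN: four 1d convolutions over the time axis."""
    def __init__(self, n_classes=2, n_channels=6, n_filters=64):
        super().__init__()
        layers, in_ch = [], n_channels
        for _ in range(4):
            layers += [nn.Conv1d(in_ch, n_filters, kernel_size=9, stride=1),
                       nn.ReLU()]
            in_ch = n_filters
        self.features = nn.Sequential(*layers)
        self.classifier = nn.Linear(n_filters, n_classes)

    def forward(self, x):           # x: (batch, 6, 150)
        h = self.features(x)        # (batch, 64, 118) without padding
        h = h.mean(dim=2)           # global average pooling over time
        return self.classifier(h)   # (batch, n_classes)

logits = ConvNet()(torch.randn(8, 6, 150))  # -> shape (8, 2)
```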
#### LSTM network
As explained in Section \ref{sec:LSTM}, recurrent networks, especially those based on LSTMs, are able to selectively use time dependencies to achieve better predictions. In our LSTM network, we use two LSTM layers, preceded by two fully connected layers and followed by two fully connected layers which form the classifier. The linear layers before the LSTMs have sizes $64$ and $32$, respectively. The LSTMs have a hidden size of $128$ each. The linear layers after the LSTMs have size $32$ and the output size, $2$ or $3$ depending on the classification problem. This amounts to around 222,000 parameters.
#### LSTM with attention mechanism (LSTM-A)
In addition to the simple LSTM model, we also implemented the LSTM with attention mechanism (LSTM-A) described in Section \ref{sec:LSTMA}, which was proposed by Zeng et al. @zeng_understanding_2018. The attention mechanism allows the network to dynamically focus on certain parts of the input by weighing the sum over a time series' LSTM hidden states. In the LSTM-A model, we directly apply one LSTM layer to the inputs, and then use a linear layer to calculate the weight of each time step's hidden state for the weighted sum. Afterwards, two linear layers are used to classify the resulting representation, the first one having $64$ units and the second one being the output layer with size $2$ or $3$. We obtain a network with around 94,000 learnable parameters.
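The following PyTorch sketch shows one plausible reading of this attention mechanism, with the attention scores computed by a linear layer that compares each hidden state to the last one; the exact scoring in the thesis implementation may differ:

```python
import torch
import torch.nn as nn

class LSTMAttention(nn.Module):
    """Sketch of LSTM-A: a weighted sum over the LSTM hidden states, with
    weights produced by a linear scoring layer (details are assumptions)."""
    def __init__(self, n_classes=2, n_channels=6, hidden=128):
        super().__init__()
        self.lstm = nn.LSTM(n_channels, hidden, batch_first=True)
        self.score = nn.Linear(2 * hidden, 1)   # compares h_t with h_T
        self.fc = nn.Sequential(nn.Linear(hidden, 64), nn.ReLU(),
                                nn.Linear(64, n_classes))

    def forward(self, x):                        # x: (batch, 150, 6)
        h, _ = self.lstm(x)                      # (batch, 150, hidden)
        last = h[:, -1:, :].expand_as(h)         # broadcast h_T to every step
        a = torch.softmax(self.score(torch.cat([h, last], dim=2)), dim=1)
        context = (a * h).sum(dim=1)             # weighted sum over time
        return self.fc(context)

logits = LSTMAttention()(torch.randn(8, 150, 6))  # -> shape (8, 2)
```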
#### DeepConvLSTM
The DeepConvLSTM and its modifications are considered state-of-the-art in human activity recognition tasks. We apply our implementation to the hand washing classification problem. DeepConvLSTM combines the advantages of convolutional layers and LSTMs. We implement it using the original design with four convolutional layers followed by two LSTM layers and a fully connected classification layer. As in our convolutional neural network, we use $64$ filters in each of the convolutional layers, a kernel size of $9$ and a stride of $1$. During preliminary testing, leaving out one LSTM layer as proposed by Bock et al. @bock_improving_2021 did not yield a significantly different performance. Thus, we use two layers as in the original study. The LSTM layers each have a hidden size of $128$. The classification layer has output size $2$ or $3$. This results in a network with around 346,000 learnable parameters.
#### DeepConvLSTM with attention mechanism (DeepConvLSTM-A)
To our knowledge, no previous work exists that couples DeepConvLSTM with the exact attention mechanism used in LSTM-A. Only after starting the work on this thesis did we find out that a similar approach had been tried by Singh et al. @singh_deep_2021. We instead combined the two methods DeepConvLSTM and LSTM-A, ending up with DeepConvLSTM-A. The attention mechanism is implemented exactly as in the LSTM with attention mechanism by Zeng et al. @zeng_understanding_2018, and is therefore different from the one used by Singh et al. The difference lies in the way the attention mechanism is implemented, as mentioned in Section \ref{deepconvlstm_att}. Mainly, while Singh et al. base the attention weight calculation jointly on the hidden state values of all time steps, we compare the LSTM's hidden state at each time step to that of the last time step. The methods are similar, but not identical, and we cannot make a statement about the performance implications of the different design choices.
Because we found out about the work of Singh et al. late, we did not add their version to the list of architectures we tried. The resulting architecture for our version of a DeepConvLSTM with attention mechanism is, as explained, different from theirs. In total, when using DeepConvLSTM-A, the data is first passed through the four convolutional layers with the same configuration as in DeepConvLSTM. Then it is passed through the LSTM, which only has one layer here, and the hidden states generated over the series of time steps are combined with the weighted sum as in LSTM-A. Afterwards, these results are passed through the fully connected classification layer. The DeepConvLSTM-A model has around 230,000 parameters that need to be optimized.
### Training routines and hyperparameter search
We trained all the models using PyTorch @paszke_pytorch_2019. The data was loaded from the Matroska container format using a modified version of the PyAV library @scholl_pyav_2021. It was then processed in NumPy @harris_array_2020 and converted to PyTorch tensors. The training of the neural networks took place on a single GTX 1070 graphics card by NVIDIA.
The training data was split into $150$-sample ($3\,s$) windows, which were then shuffled before being used to train the models on the different paradigms. The shuffling is done so that the mini-batch method receives random batches rather than groups of windows from the same temporal context, and to avoid overfitting the network to the order of the windows in the training data.
#### Hyperparameter search
##### Batch size
In order to find the best batch size, sizes between 32 and 1024 samples per batch were manually tested on the model classes. On the validation data, using 512 samples per batch yielded the best results, although not by a significant margin. Therefore, we fixed the batch size to 512 for our experiments.
##### Learning rate
There is a connection between the batch size and the learning rate. Increasing the batch size can have a similar effect as reducing the learning rate over time (learning rate decay) @smith_dont_2018. Since we use a comparatively big batch size for our model training, we experimented with smaller learning rate values. During preliminary testing on the validation set, different initial values from 0.01 to 0.00001 were tested. We fixed the initial learning rate to 0.0001, as this provided the best performance. We had implemented starting with a higher learning rate and then using learning rate decay, but found during preliminary testing on the validation set that this approach did not improve the performance in our case. We also found that starting with higher learning rates ($lr > 0.01$) led to numerical instability in the recurrent networks, producing NaN values for gradients and thus parameters. This means the training became unstable for the networks containing LSTM layers, hence the learning rate had to be reduced for these networks anyway.
##### Loss function
As loss function, we use the cross-entropy loss, weighted by the classes' frequencies ($\mathcal{L}_{weighted}$). This means that the loss function corrects for imbalanced classes, and we do not have to rely on subsampling or repetition to balance the class frequencies in the data set. The weighted cross-entropy loss is defined as shown in equation \ref{eqn:cross_entropy_loss}. We first apply the "softmax" function to the model's output $\mathbf{x}$ (see equations \ref{eqn:softmax} and \ref{eqn:apply_softmax}). Then, the loss is calculated by applying the weighted cross-entropy loss function, with the weight of each class being the inverse of its relative frequency in the training set. This way, the predictions for all classes have the same potential influence on the parameter updates, despite the classes not being perfectly balanced.
\begin{figure}
\begin{align}
Softmax(\mathbf{x})_j &= \frac{exp(\mathbf{x}_j)}{\sum_k exp(\mathbf{x}_k)}
\label{eqn:softmax} \\
\mathbf{p} &= Softmax(\mathbf{x})
\label{eqn:apply_softmax}
\end{align}
\begin{align}
\mathcal{L}(\mathbf{p}, \mathbf{y}) &= - \sum_{i=1}^{N}\mathbf{y}_i\cdot log(\mathbf{p}_i)
\end{align}
\begin{align}
\mathcal{L}_{weighted}(\mathbf{p}, \mathbf{y}) &= \frac{- \sum_{i=1}^{N}\mathbf{y}_i\cdot log(\mathbf{p}_i) \cdot weight(class(\mathbf{y}_i))}{\sum_{i=1}^N weight(class(\mathbf{y}_i))}
\label{eqn:cross_entropy_loss}
\end{align}
\end{figure}
The cross-entropy loss works well for classification tasks, which is why we relied on it in this work. It is the de-facto loss function in most modern neural network based classification tasks @demirkaya_exploring_2020. The cross-entropy loss can be used for multiclass problems, but also works well for two-class problems. For those binary classification tasks we use the binary cross-entropy loss.
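In PyTorch, the weighted cross-entropy loss of equation \ref{eqn:cross_entropy_loss} corresponds to `nn.CrossEntropyLoss` with a weight vector: it applies the softmax internally and, with the default mean reduction, also normalizes by the summed target weights. A sketch with inverse relative class frequencies as weights (the label tensor is illustrative):

```python
import torch
import torch.nn as nn

# Hypothetical label tensor for one training set; the weights are the
# inverse relative class frequencies, as described above.
y_train = torch.randint(0, 2, (1024,))
counts = torch.bincount(y_train).float()
weights = counts.sum() / counts          # inverse of the relative frequency

criterion = nn.CrossEntropyLoss(weight=weights)
logits = torch.randn(1024, 2)            # stand-in model outputs
loss = criterion(logits, y_train)
```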
#### Dropout
Dropout is a method that helps to prevent neural networks from overfitting to the training set @srivastava_dropout_nodate. It works by "dropping" random units of a neural network during training with a given probability. The "dropped" units are simply set to 0. This forces the neural network to learn different paths and prevents the network from co-adapting its units too much @srivastava_dropout_nodate. Dropout can also be viewed as jointly training an ensemble of slightly different neural networks at once, but is much more efficient at this than an actual ensemble. Dropout is regularly used in the training of neural networks.
We applied dropout to all model classes in preliminary testing. On the validation data, dropout with $p=0.25$ was tested for all models. Of all the models, only the fully connected network showed an increased validation performance. For this reason, dropout was only applied in the fully connected model.
#### Early stopping
We used early stopping, based on the split-off validation set. Early stopping is a regularization technique @prechelt_early_1998 which is frequently employed during the training of neural networks. It helps to prevent overfitting to the training set by stopping the training process early. In order to decide at which point in the process, i.e. after which epoch, the training should be stopped, we monitor the loss function on the validation set. The model is trained utilizing the training set, but as soon as the validation loss starts to rise, we can stop the training. This makes sense because we can assume that the increase of the validation loss reflects the unknown trend of the loss on the test set, which we cannot look at during train time. An example of this process can be seen in fig. \ref{fig:learning_curves}, which shows the comparison between training and validation losses over the course of the training.
\begin{figure}[!h]
\centering
% learning curve plots elided
\caption{Training and validation loss over the course of the training}
\label{fig:learning_curves}
\end{figure}
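A minimal sketch of the early stopping rule follows. The patience parameter, i.e. tolerating several epochs without improvement before stopping, is an assumption, as is the callable interface; the thesis implementation may stop differently:

```python
import copy

def train_with_early_stopping(model, train_step, val_loss_fn,
                              max_epochs=200, patience=10):
    """Keep the parameters with the lowest validation loss and stop once it
    has not improved for `patience` epochs. `train_step` runs one epoch over
    the training set; `val_loss_fn` returns the validation loss (assumed
    callables, not thesis API)."""
    best_loss, best_state, since_improvement = float("inf"), None, 0
    for epoch in range(max_epochs):
        train_step(model)
        val_loss = val_loss_fn(model)
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            since_improvement = 0
        else:
            since_improvement += 1
            if since_improvement >= patience:
                break                  # validation loss keeps rising: stop
    model.load_state_dict(best_state)
    return model
```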
The course of action on the smart watch is shown in fig. \ref{fig:watch_flow}. The watch continuously records the data from the integrated IMU to fill a buffer. To filter out the most basic idle case of "no movement", the neural network is only run to classify the current activity if at least one sensor value is higher than a certain threshold $v_{idle}$ that is fixed inside the application. If there is enough movement to reach the threshold, a forward pass of the neural network model is done with the data from the last few seconds. It is possible to set the interval classified in each network pass to a value from $1$ second to $10$ seconds, but the model must be trained for the specific interval length, as mentioned above. Our windows had a length of 3 seconds (150 samples of the 6 sensor axes). The forward pass then outputs class probabilities for each of the windows considered.
In order to avoid false positives and outliers, smoothing can be applied to the network outputs. The smoothing acts as a low-pass filter on the predictions, filtering out rapid changes in the output. To this end, we employ a threshold on the running mean of the last $n$ predictions, over a fixed interval, e.g. 15 seconds. If the running mean reaches the threshold, the final prediction of the window will be "hand wash". We tried different interval sizes and thresholds on the validation sets for each of the models and problems, and report the projected performance results for the best thresholds found on the validation set in Section \ref{sec:results}. Note that the running mean threshold smoothing we use should not be mistaken for the method proposed by Szegedy et al. @szegedy_rethinking_2015, which is also referred to as label smoothing.
In the end, if the final prediction of the pipeline is "hand wash", a notification can be sent to the user. The running mean filter is especially important to avoid sending out too many notifications for false positives. The notification triggers a cooldown, i.e. a timer that ensures the notification is not sent again for a certain amount of time; we used $45\,s$. Depending on the state of the study, the notification can be used for further data collection. The application can ask the user if the detection was correct, so that we can validate and annotate new data points.
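The post-processing described above can be sketched as follows. The threshold value and the step counts (10 predictions for roughly 15 s and 30 steps for roughly 45 s, assuming one prediction every 1.5 s due to the 50 % window overlap) are illustrative:

```python
from collections import deque

class SmoothedDetector:
    """Sketch of the on-watch post-processing: running mean over the last n
    hand wash probabilities, a decision threshold, and a notification
    cooldown. All parameter values here are illustrative assumptions."""
    def __init__(self, n=10, threshold=0.7, cooldown_steps=30):
        self.history = deque(maxlen=n)
        self.threshold = threshold
        self.cooldown_steps = cooldown_steps
        self.cooldown = 0

    def update(self, p_hand_wash):
        """Feed one network output; return True if a notification fires."""
        self.history.append(p_hand_wash)
        if self.cooldown > 0:
            self.cooldown -= 1
            return False
        mean = sum(self.history) / len(self.history)
        if mean >= self.threshold:
            self.cooldown = self.cooldown_steps  # suppress repeats for ~45 s
            return True
        return False
```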
## Evaluation
In order to evaluate the developed systems, different methods of evaluation are taken into account. We are able to run theoretical performance measures on the available pre-recorded hand washing data. We have to define fitting metrics and scores in order to get a meaningful estimate of the expected real-world performance.
On the other hand, a practical evaluation can be run with actual test users on the smart watch system using the application described above. We test the detection ability of our model in a controlled environment and in "real" everyday use. The results of both evaluations are reported in Section \ref{sec:results}.
### Theoretical evaluation
We use the previously unseen sliding window sensor data $\mathbf{X}_{test}$ as input to each of the models that we previously trained, with the respective labels $\mathbf{y}_{test}$ as ground truth. From the models' outputs, we obtain the predictions $\mathbf{p}$. In order to evaluate how well each model performs on the unseen data, we have to compare the predictions to the ground truth labels in a meaningful way, utilizing metrics to score each model's performance.
Different metrics can be used for a binary classification system. As the classes are usually highly unbalanced in our scenario of hand washing detection, the simple accuracy score is not sufficient. As most activities in the daily life of a user will be "no hand washing", an accuracy of over $90\,\%$ could be reached by always predicting "no hand washing", without actually solving the problem of hand washing detection. The accuracy score does not take this disparity in class sizes into account and is thus not a meaningful metric for our problem formulation.
We therefore use and report the following, more sophisticated, metrics:
\begin{align}
S\ score &= 2 \cdot \frac{Sensitivity \cdot Specificity}{Sensitivity + Specificity}
\end{align}
\end{figure}
\label{s_score}
The sensitivity is the ratio of positive samples that are correctly recognized. The specificity is the ratio of negatives that are correctly recognized. If both measures are close to 1, the model performs well. The precision is the ratio of true positives among all positive predictions. It is similar to the sensitivity, but also punishes false positives to some extent. The recall is the same as the sensitivity. The harmonic mean of recall and precision is called F1 score and is commonly used to evaluate binary prediction tasks. Since we especially need to balance specificity and sensitivity for our task, we also report the S score, which we define as the harmonic mean of specificity and sensitivity. One of the reasons for reporting the S score is the lack of false positive punishment in the F1 score formula. The F1 score does not punish false positives as much as needed in the task of compulsive hand washing detection. While false positives are partly accounted for in the precision measure, if there are many positives in the ground truth, the precision will not weigh false positives enough. Including the specificity in the measure therefore makes sure we do not lose track of the false positives, which would be annoying to the user, especially if we send out smart watch notifications with vibration or sound alerts.
For the multiclass problem of distinguishing compulsive hand washing, normal hand washing and other activities, the binary metrics are not applicable. Here, we report normalized confusion matrices and their mean diagonal values as one performance measure. The confusion matrix shows what share of the samples belonging to a certain class (true labels, rows of the matrix) is predicted to belong to each class (predicted labels, columns of the matrix). The normalized version of the confusion matrix replaces the absolute counts by ratios in proportion to the number of true labels for each class. This means that the values in each true label row of the matrix sum to 1.
The mean diagonal value of this matrix can be seen as a mean class accuracy score, as the diagonal values of the normalized confusion matrix are the accuracy values for each class.
\begin{figure}
\begin{align}
S\ score\ multi = \frac{1}{3}\cdot \sum_{i=0}^{2} S\ score(\mathbf{C}_i)
\end{align}
\end{figure}
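The following sketch shows how these multiclass measures can be computed, reading each $\mathbf{C}_i$ as the one-vs-rest binarization for class $i$ (class indices and function names are illustrative, not from a library):

```python
import numpy as np

def normalized_confusion(y_true, y_pred, n_classes=3):
    """Row-normalized confusion matrix: each true-label row sums to 1."""
    c = np.zeros((n_classes, n_classes))
    for t, p in zip(y_true, y_pred):
        c[t, p] += 1
    return c / c.sum(axis=1, keepdims=True)

def s_score_multi(y_true, y_pred, n_classes=3):
    """Mean one-vs-rest S score over all classes."""
    scores = []
    for i in range(n_classes):
        sens = np.mean(y_pred[y_true == i] == i)  # sensitivity of class i
        spec = np.mean(y_pred[y_true != i] != i)  # specificity of class i
        scores.append(2 * sens * spec / (sens + spec))
    return np.mean(scores)

# The mean diagonal value is the mean per-class accuracy:
# np.mean(np.diag(normalized_confusion(y_true, y_pred)))
```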
We also report the metrics used for problem 1 on a binarized version of the third problem. To binarize the problem, we define "hand washing" as the positive class and the remainder as the negative class. Note that "hand washing" includes "compulsive hand washing". With this binarization, we can compare the models trained on the multiclass problem to the models trained on the initial binary problem. However, as problem 1 is a special case of problem 3, we expect the performance of the models trained for problem 3 to be lower than that of the models trained for problem 1.
\label{chained_model}
In addition, we report the performance of the best two models for problem 1 and problem 2 chained together and then tested on problem 3. This means we execute the best model for hand washing detection first, and then, for all sample windows that were detected as hand washing, we run the best model for classifying compulsive vs. non-compulsive hand washing. From this chain, we derive three-class predictions by counting all samples that were not detected by the first model as negatives (Null), and the samples predicted to be hand washing but not predicted to be compulsive by the second model as hand washing (HW). The remaining samples are classified as compulsive hand washing (HW-C) by the chained model, as sketched below. This chained model could possibly perform better, as it is the combination of two different models, which thus have had more training time and possibly a higher capacity. However, chaining two models also takes up more memory and computation time on the device, and is thus less efficient.
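A minimal sketch of this chaining, assuming both trained models expose a `predict` method that returns one binary label per window (the method name and class indices are illustrative):

```python
import numpy as np

NULL, HW, HWC = 0, 1, 2  # three-class labels of problem 3

def chained_predict(model_p1, model_p2, windows):
    """Chain the binary models of problem 1 and problem 2 into
    three-class predictions for problem 3."""
    labels = np.full(len(windows), NULL)
    washing = model_p1.predict(windows) == 1  # hand washing vs. Null
    if washing.any():
        compulsive = model_p2.predict(windows[washing]) == 1
        labels[washing] = np.where(compulsive, HWC, HW)
    return labels
```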
### Practical evaluation
For the practical evaluation, we asked 5 subjects to test the system in practice. We defined two different paradigms, one for real-world performance evaluation and one for explicit evaluation of the model running on the smart watch. For this purpose, the model with the best performance on the test set of task 1, i.e. the general detection of hand washing, was exported and executed on the watch inside the described smart watch application. We limited the testing to these scenarios because we did not have access to subjects who actually wash their hands compulsively.
The scenarios were:
1. The subjects wear a smart watch for one day. During this time, whenever they wash their hands, the watch may or may not detect the hand washing procedure. The subjects note down whether or not the hand washing was recognized correctly, how long the washing procedure took and how intense it was on a scale of 1 to 5. In addition, whenever hand washing is falsely detected, they note down their current activity.
Automatically detecting the current activity of a human being is a wide field of research.
In the area of gesture recognition, we try to detect and classify specific, narrowly defined gestures.
The defined gestures can e.g. be used to actively control a system @saini_human_2020. This kind of approach is not directly applicable to our task of detecting hand washing. However, it could be possible to adapt algorithms from this field to the detection of a new gesture or a new set of gestures related to hand washing.
There are camera-based approaches and physical measurement-based approaches @saini_human_2020. The camera-based approaches were out of scope for this work. As explained in the introduction, in our setting, wrist-worn devices have significant advantages over camera-based solutions that would have to be stationary, i.e. in fixed locations.
There also exist approaches based on inertial measurement sensors. These sensors measure movement-related physical quantities, such as the acceleration, the angular velocity or the orientation in space.
Gesture recognition, in general, uses methods similar to those of the more difficult human activity recognition @saini_human_2020, which will be explained below.
## Human activity recognition
\label{section:har}
Recognizing more than one gesture or body movement in combination in a temporal context and deriving the current activity of the user is called human activity recognition (HAR). In this task, we want to detect more general activities, compared to the shorter and simpler gestures. An activity can include many distinguishable gestures. However, the same activity will not always include all of the same gestures and the gestures that are implicitly included could be in a different order for every repetition. Activities are less repetitive than gestures, and harder to detect in general @zhu_wearable_2011. However, Zhu et al. have shown that the combined detection of multiple different gestures can be used in HAR tasks too @zhu_wearable_2011, which makes sense, because a human activity can consist of many gestures. Nevertheless, most methods used for HAR consist of more direct applications of machine learning to the data, without the detour of detecting specific gestures contained in the execution of an activity.
Methods used in HAR include classical machine learning methods as well as deep learning @liu_overview_2021 @bulling_tutorial_2014. The classical machine learning methods rely on features of the data obtained by feature engineering, i.e. the computation of meaningful statistics over the time frame for which the activity should be predicted. The features can be frequency-domain-based or time-domain-based; usually both kinds are used together to train these conventional models @liu_overview_2021 (a sketch of such features follows below). The classical machine learning methods include Random Forests (RFC), Hidden Markov Models (HMM), Support Vector Machines (SVM), the $k$-nearest neighbors algorithm and others.
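As an illustration of such feature engineering, the following sketch computes simple time-domain and frequency-domain features per sensor channel of one window (the concrete feature set is our example, not the one used by any specific study):

```python
import numpy as np

def window_features(window):
    """Hand-crafted features for one window of shape (time steps, channels)."""
    feats = []
    for channel in window.T:
        # time-domain statistics
        feats += [channel.mean(), channel.std(), channel.min(), channel.max()]
        # frequency-domain statistics from the magnitude spectrum
        spectrum = np.abs(np.fft.rfft(channel))
        feats += [spectrum.argmax(), spectrum.mean(), spectrum.std()]
    return np.array(feats)  # input vector for RFC, SVM, k-NN, ...
```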
#### Deep neural networks
Recently, deep neural networks have taken over the role of the state-of-the-art machine learning method in the area of human activity recognition @bock_improving_2021, @liu_overview_2021. Deep neural networks are universal function approximators @bishop_pattern_2006 and are known for being easy to use on "raw" data. They are "artificial neural networks" consisting of multiple layers, where each layer contains a certain number of nodes that are connected to the nodes of the following layer. The connections are each assigned a weight, and the weighted sum over the values of all the previous connected nodes is used to calculate the value of a node in the next layer. Simple neural networks where all nodes of a layer are connected to all nodes in the following layer are often called "fully connected neural networks" (FC-NN or FC).
The connections' parameters are optimized using forward passes through the network of nodes, followed by the execution of the backpropagation algorithm and an optimization step. We accumulate the gradients of a loss function with respect to each parameter over a small subset of the data (mini-batch) and perform "stochastic gradient descent" (SGD). SGD, or a similar optimization method like the commonly used Adam optimizer @kingma_adam_2017, performs a parameter update step. After many such updates, and if the training works well, the network parameters will have been updated to values that lead to a lower value of the loss function on the training data. However, there is no guarantee of convergence whatsoever. As mentioned above, deep neural networks can, in theory, be used to approximate arbitrary functions. Nevertheless, the parameters of the perfect approximation cannot easily be found, and empirical testing has revealed that neural networks need a lot of training data in order to perform well, compared to classical machine learning methods. In return, with enough data, deep neural networks often outperform classical machine learning methods.
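A minimal PyTorch sketch of this training loop, using a small fully connected network as described above (the window size of 6 channels x 150 time steps, the random stand-in data and all hyperparameters are placeholder values):

```python
import torch
from torch import nn

# toy stand-ins: 32 windows of 6 sensor channels x 150 time steps, flattened
x = torch.randn(32, 6 * 150)
y = torch.randint(0, 2, (32,))  # binary labels

model = nn.Sequential(nn.Linear(6 * 150, 64), nn.ReLU(), nn.Linear(64, 2))
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(100):
    optimizer.zero_grad()        # clear the accumulated gradients
    loss = loss_fn(model(x), y)  # forward pass and loss computation
    loss.backward()              # backpropagation
    optimizer.step()             # one Adam parameter update
```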
###### Convolutional neural networks (CNNs)
are neural networks that are not fully connected, but work by using convolutions with a kernel that is slid over the input. CNNs were first introduced for hand-written character recognition @lecun_backpropagation_1989 @le_cun_handwritten_1990 (1989, 1990), but were later revived for computer vision tasks @krizhevsky_imagenet_2012 (2012), once enough computational power was available on modern devices to train them. Since the rise of CNNs in computer vision, most computer vision problems are solved with their help. The convolutions work by moving filter windows with learnable parameters (also called kernels) over the input @albawi_understanding_2017. As opposed to a fully connected network, the weights are shared over many of the nodes, because the same filters are applied over the full size of the input. CNNs have fewer parameters to train than a fully connected network with the same number of nodes, which makes them easier to train. They are generally expected to perform better than FC networks, especially on image-related tasks. The filters can be 2-dimensional (2d), as for images (e.g. a 5x5 filter moved across the two axes of an image), or 1-dimensional (1d), which can e.g. be used to slide a kernel along the time dimension of a sensor recording. Even in the 1-dimensional case, fewer parameters are needed compared to a fully connected network. Thus, the 1-dimensional CNN is expected to be easier to train and to achieve a better performance on time series sensor data than a fully connected network.
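A sketch of such a 1-dimensional CNN in PyTorch, sliding kernels along the time axis of a sensor window (channel counts, kernel sizes and the window shape are illustrative):

```python
import torch
from torch import nn

# input shape: (batch, sensor channels, time steps)
model = nn.Sequential(
    nn.Conv1d(in_channels=6, out_channels=32, kernel_size=5),  # slide along time
    nn.ReLU(),
    nn.MaxPool1d(2),
    nn.Conv1d(32, 64, kernel_size=5),
    nn.ReLU(),
    nn.AdaptiveAvgPool1d(1),  # collapse the remaining time axis
    nn.Flatten(),
    nn.Linear(64, 2),         # binary classification head
)

logits = model(torch.randn(8, 6, 150))  # 8 windows, 6 channels, 150 steps
```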
###### Recurrent neural networks (RNNs)
The inputs to the cell are the external inputs $\mathbf{x}_t$ (from the previous layer) and the hidden state $\mathbf{h}_{t-1}$ from the previous time step.
\caption*{($\odot$ marking element-wise multiplication)}
\end{figure}
The four gates of the LSTM are:
- forget gate
- new memory gate
- input gate
- output gate
![LSTM Cell, by Guillaume Chevalier, CC BY-SA 4.0, with added labeling for the gates](img/LSTM_Cell.png){#fig:lstm_cell width=85%}
These gates are fully connected neural network layers (marked in orange and with the corresponding activation functions in @fig:lstm_cell) with respective weights and biases, and serve the functionality from which their names are derived. The weights and biases must be learned during the training phase of the neural network. The forget gate decides which part of the "remembered" cell memory $\mathbf{c}_{t-1}$ the LSTM applies in the current step, i.e. which bits should be used to which extent with regard to the current input data $\mathbf{x}_t$ and the hidden state from the last time step $\mathbf{h}_{t-1}$. The output of the forget gate, $\mathbf{f}_t$, multiplied element-wise with $\mathbf{c}_{t-1}$, is considered the "remembered" information from the last step. The new memory gate and the input gate are used to decide which new data is added to the cell state. These two layers are also given the previous step's hidden state $\mathbf{h}_{t-1}$ and the current step's input $\mathbf{x}_t$. In combination, the new memory network output $\tilde{\mathbf{c}}_t$ and the input gate's output $\mathbf{i}_t$ decide which components of the current input and hidden state are taken into the new memory state $\mathbf{c}_{t}$. The memory state is passed on to the next step. The output gate generates $\mathbf{o}_t$, which is combined with $\tanh(\mathbf{c}_{t})$ by element-wise multiplication to form the new hidden state $\mathbf{h}_{t}$.
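For reference, the standard formulation of the gate computations described above reads as follows ($\sigma$ denoting the logistic sigmoid; the weight matrices $\mathbf{W}$, $\mathbf{U}$ and biases $\mathbf{b}$ belong to the respective gates):

\begin{align}
\mathbf{f}_t &= \sigma(\mathbf{W}_f \mathbf{x}_t + \mathbf{U}_f \mathbf{h}_{t-1} + \mathbf{b}_f)\\
\tilde{\mathbf{c}}_t &= \tanh(\mathbf{W}_c \mathbf{x}_t + \mathbf{U}_c \mathbf{h}_{t-1} + \mathbf{b}_c)\\
\mathbf{i}_t &= \sigma(\mathbf{W}_i \mathbf{x}_t + \mathbf{U}_i \mathbf{h}_{t-1} + \mathbf{b}_i)\\
\mathbf{o}_t &= \sigma(\mathbf{W}_o \mathbf{x}_t + \mathbf{U}_o \mathbf{h}_{t-1} + \mathbf{b}_o)\\
\mathbf{c}_t &= \mathbf{f}_t \odot \mathbf{c}_{t-1} + \mathbf{i}_t \odot \tilde{\mathbf{c}}_t\\
\mathbf{h}_t &= \mathbf{o}_t \odot \tanh(\mathbf{c}_t)
\end{align}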
Note that the calculation of $\alpha_t$ is done with the softmax function, as shown above.
Zeng et al. evaluate their approach on 3 data sets and report state-of-the-art performance, beating the initial DeepConvLSTM.
\label{deepconvlstm_att}
Another study by Singh et al. combines DeepConvLSTM with a self-attention mechanism @singh_deep_2021. The attention mechanism is very similar to the one used by Zeng et al. @zeng_understanding_2018, where the mechanism consists of a layer that follows the LSTM layers in the DeepConvLSTM network. Instead of utilizing a score layer which uses the relation of each $h_t$ to $h_T$, Singh et al. find the weights $\mathbf{\alpha}$ for the same weighted sum by applying the softmax function to the output of a fully connected layer through which they pass the concatenated $h_t$ values. Instead of considering only the relations of each $h_t$ to $h_T$ separately, they use one layer to jointly calculate all the attention weights. Other than that, the two attention mechanisms are similar. Singh et al. also report a statistically significant increase in performance compared to the initial DeepConvLSTM, although they evaluate their approach on different data sets than Zeng et al.
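A simplified sketch of such an attention layer over the LSTM outputs (our reading of the mechanism, not the exact implementation of either paper):

```python
import torch
from torch import nn

class AttentionPooling(nn.Module):
    """Replaces the last hidden state h_T by a learned weighted sum of all h_t."""

    def __init__(self, hidden_size):
        super().__init__()
        self.score = nn.Linear(hidden_size, 1)  # one scalar score per time step

    def forward(self, h):  # h: (batch, T, hidden_size), the LSTM outputs
        alpha = torch.softmax(self.score(h), dim=1)  # attention weights over time
        return (alpha * h).sum(dim=1)                # weighted sum of the h_t
```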
For HAR, DeepConvLSTM and the models derived from it are the state-of-the-art machine learning methods, as they consistently outperform other model architectures on the available benchmarks and data sets.
In order to separate hand washing from other activities, Mondol et al. employ a neural network and filter its positive predictions based on the distribution of the features learned for the positive class.
![Steps of HAWAD for parameter estimation and inference, taken from @sayeed_mondol_hawad_2020](img/HAWAD_filter.png){width=98% #fig:HAWAD}
They use the said features of all positive class samples to calculate the estimated mean $\boldsymbol{\hat{\mu}}$ and covariance matrix $\hat{\mathbf{\Sigma}}$ of the feature distribution. Based on these estimates, one can compute each sample's distance to the distribution using the Mahalanobis distance (as seen in equation \ref{eqn:mahala}). If, at test time, the model predicts a sample to belong to the positive class, the distance is calculated. If the distance is larger than a threshold $d_{th}$, the sample is classified as a negative. The threshold $d_{th}$ is chosen such that almost all positive samples seen during training are included. The parameter estimation and inference steps performed in the HAWAD paper can be seen in @fig:HAWAD, and a sketch of the filtering follows below. On their own data set (HAWAD data set) they reach F1 scores of over 90% for hand washing detection.
\begin{figure}
\begin{align}
\label{eqn:mahala}
d(\mathbf{x}) = \sqrt{(\mathbf{x} - \boldsymbol{\hat{\mu}})^{T}\, \hat{\mathbf{\Sigma}}^{-1}\, (\mathbf{x} - \boldsymbol{\hat{\mu}})}
\end{align}
\end{figure}
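A sketch of this filtering step with NumPy (function names are ours; the threshold $d_{th}$ is assumed to be chosen on the training data as described):

```python
import numpy as np

def fit_distribution(pos_features):
    """Estimate mean and inverse covariance of the positive-class features."""
    mu = pos_features.mean(axis=0)
    cov_inv = np.linalg.inv(np.cov(pos_features, rowvar=False))
    return mu, cov_inv

def mahalanobis(x, mu, cov_inv):
    d = x - mu
    return np.sqrt(d @ cov_inv @ d)

def keep_positive(x, mu, cov_inv, d_th):
    """Reject a positive prediction whose features lie too far
    from the positive-class distribution."""
    return mahalanobis(x, mu, cov_inv) <= d_th
```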
\label{sec:results}
This chapter reports the results of both the theoretical and the practical evaluation.
## Theoretical evaluation
For the theoretical evaluation, we report the results separately, split by tasks 1-3 described in Section \ref{sec:classification_problems}.
In all tables of this chapter, the best values for a specific metric will be highlighted in bold font.
The models running on normalized data also profit from the label smoothing.
For the special case of the models initially trained on problem 3 which were then binarized and run on problem 1 (without smoothing), we only report some results in this section. The full results can be found in the appendix. Surprisingly, the models trained on problem 3 reach similar F1 scores on the test data of problem 1 as the models trained on problem 1. DeepConvLSTM achieves an F1 score of $0.857$, DeepConvLSTM-A achieves $0.847$. The F1 score of DeepConvLSTM is even higher than the highest F1 score of the models trained for problem 1 by $0.004$. However, for the S score metric, the models trained for problem 3 can only reach up to $0.704$ (CNN) or $0.671$ (DeepConvLSTM-A), which is lower by $0.052$ than the best performing model trained for problem 1.
\FloatBarrier
### Distinguishing compulsive hand washing from non-compulsive hand washing
The results without smoothing of predictions for the second task, distinguishing compulsive hand washing from non-compulsive hand washing, can be seen in table \ref{tbl:only_conv_hw}. In @fig:p2_metrics, the results with and without smoothing are shown. In terms of the F1 score metric, the LSTM model performs best ($0.926$). It is closely followed by DeepConvLSTM-A ($0.922$) and DeepConvLSTM ($0.918$). However, the RFC also performs surprisingly well, with an F1 score of $0.891$, even beating the CNN ($0.883$) and FC networks ($0.886$). Due to the imbalance of classes in the test set ($70.6\,\%$ of samples correspond to the positive class), the majority classifier reaches an F1 score of $0.828$. The S score is best for DeepConvLSTM ($0.869$) and LSTM ($0.862$), followed by LSTM-A ($0.848$) and DeepConvLSTM-A ($0.846$). The baseline methods RFC ($0.734$) and SVM ($0.701$) fail to reach S scores similar to those of the neural network based methods.
The confusion matrices of the non-normalized models are shown in the right column.
As for problem 1 and problem 2, we find that normalization seems to decrease the performance of all the neural network based classifiers. For this problem, the FC network also shows a decreased performance when normalized input data is used.
The respective confusion matrices for the baseline classifiers, i.e. RFC, SVM, majority classifier and random classifier, are displayed in @fig:confusion_baselines. For both SVM and RFC, in both the normalized and the non-normalized versions, the confusion matrices show that the Null class was predicted most often. In the non-normalized version, $94\,\%$ of the samples belonging to the Null class are predicted correctly by both of these methods. However, they also predict most of the samples belonging to the other classes as Null. The HW class is classified as Null in $71\,\%$ (SVM) and $67\,\%$ (RFC) of its samples, with only $15\,\%$ (SVM) and $22\,\%$ (RFC) being identified correctly. The accuracy is better for the HW-C class, where a ratio of correct predictions of $0.42$ (SVM) and $0.43$ (RFC) is reached, although there is again a high proportion of misclassifications into the Null class (SVM: $0.56$, RFC: $0.54$).
The majority classifier assigns all samples to the Null class, which leads to an accuracy of $1.0$ on the samples belonging to the Null class and $0.0$ on all samples belonging to the HW and HW-C classes.
The random classifier also does not perform well and reaches values around $0.33$ for each of the fields of the confusion matrix.
\newpage
\begin{figure}[H]
\centering
\includegraphics[width=0.98\textwidth]{img/confusion_baselines.pdf}
\caption{Confusion matrices for all baseline classifiers with and without normalization of the sensor data}
\label{fig:confusion_baselines}
\end{figure}
![Confusion matrix of the chained best classifiers for problem 1 (DeepConvLSTM-A) and problem 2 (DeepConvLSTM), applied to problem 3](img/chained_confusion.pdf){#fig:chained_confusion width=60%}