Commit fe6c40df authored by burcharr

automatic writing commit ...

parent b8e20a88
......@@ -897,7 +897,7 @@ Type: article},
booktitle = {2016 {IEEE} International Conference on Pervasive Computing and Communications ({PerCom})},
author = {Sztyler, Timo and Stuckenschmidt, Heiner},
date = {2016-03},
keywords = {Acceleration, Biomedical monitoring, Context, Feature extraction, Gravity, Performance evaluation, Sensors},
keywords = {Sensors, Acceleration, Biomedical monitoring, Context, Feature extraction, Gravity, Performance evaluation},
file = {IEEE Xplore Full Text PDF:/home/robin/Zotero/storage/8A7UCP3G/Sztyler and Stuckenschmidt - 2016 - On-body localization of wearable devices An inves.pdf:application/pdf;IEEE Xplore Abstract Record:/home/robin/Zotero/storage/ZA3NXXR7/7456521.html:text/html},
}
......@@ -929,6 +929,33 @@ Type: article},
author = {Reiss, Attila and Stricker, Didier},
date = {2012-06},
note = {{ISSN}: 2376-8541},
keywords = {Accuracy, Benchmark testing, Biomedical monitoring, Decision trees, Heart rate, Monitoring, Standards},
keywords = {Accuracy, Biomedical monitoring, Benchmark testing, Decision trees, Heart rate, Monitoring, Standards},
file = {IEEE Xplore Full Text PDF:/home/robin/Zotero/storage/2SGLX27L/Reiss and Stricker - 2012 - Introducing a New Benchmarked Dataset for Activity.pdf:application/pdf;IEEE Xplore Abstract Record:/home/robin/Zotero/storage/FC9A9P9I/6246152.html:text/html},
}
@article{harris_array_2020,
title = {Array programming with {NumPy}},
volume = {585},
rights = {2020 The Author(s)},
issn = {1476-4687},
url = {https://www.nature.com/articles/s41586-020-2649-2},
doi = {10.1038/s41586-020-2649-2},
abstract = {Array programming provides a powerful, compact and expressive syntax for accessing, manipulating and operating on data in vectors, matrices and higher-dimensional arrays. {NumPy} is the primary array programming library for the Python language. It has an essential role in research analysis pipelines in fields as diverse as physics, chemistry, astronomy, geoscience, biology, psychology, materials science, engineering, finance and economics. For example, in astronomy, {NumPy} was an important part of the software stack used in the discovery of gravitational waves1 and in the first imaging of a black hole2. Here we review how a few fundamental array concepts lead to a simple and powerful programming paradigm for organizing, exploring and analysing scientific data. {NumPy} is the foundation upon which the scientific Python ecosystem is constructed. It is so pervasive that several projects, targeting audiences with specialized needs, have developed their own {NumPy}-like interfaces and array objects. Owing to its central position in the ecosystem, {NumPy} increasingly acts as an interoperability layer between such array computation libraries and, together with its application programming interface ({API}), provides a flexible framework to support the next decade of scientific and industrial analysis.},
pages = {357--362},
number = {7825},
journaltitle = {Nature},
author = {Harris, Charles R. and Millman, K. Jarrod and van der Walt, Stéfan J. and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and van Kerkwijk, Marten H. and Brett, Matthew and Haldane, Allan and del Río, Jaime Fernández and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.},
urldate = {2021-10-26},
date = {2020-09},
langid = {english},
publisher = {Nature Publishing Group},
keywords = {Computational neuroscience, Computational science, Computer science, Software, Solar physics},
file = {Full Text PDF:/home/robin/Zotero/storage/FFD42GPJ/Harris et al. - 2020 - Array programming with NumPy.pdf:application/pdf;Snapshot:/home/robin/Zotero/storage/9TW8BU97/s41586-020-2649-2.html:text/html},
}
\ No newline at end of file
......@@ -4,18 +4,58 @@
## Discussion of Results
In this section, we discuss the results of our hand washing detection system evaluation. We summarize the conclusions we draw from these results and give an overview of limitations and possible future improvements of our approach.
### Theoretical evaluation
The results of the theoretical evaluation show that, for each of the defined problems, the neural network based methods can learn to classify the desired activities with high accuracy. However, the problems differ in difficulty, and the resulting F1 scores and S scores are not yet perfect, which means there is still room for improvement.
##### Problem 1
For the problem of classifying hand washing and separating it from all other activities, the raw predictions of the networks without smoothing reached an F1 score of $0.853$ (DeepConvLSTM) and an S score of $0.758$ (DeepConvLSTM-A). DeepConvLSTM and DeepConvLSTM-A surpass all other models that we tested, including the RFC, SVM, majority classifier and random classifier baselines, by large margins. This is in line with related work on other human activity recognition tasks, where DeepConvLSTM, with and without small modifications, also achieved the best results. On this specific problem, the CNN model also deserves mention: its performance was worse than, but not far from, that of the DeepConvLSTM based models.
The application of smoothing improved the performance of the models even further, to an F1 score of $0.892$ (DeepConvLSTM) and an S score of $0.819$ (DeepConvLSTM-A). This performance boost can be explained by the temporal context captured in the data: if many windows in rapid succession are classified as hand washing, a small number of interspersed Null class predictions is likely to be wrong. The smoothing helps to filter out both false positives and false negatives.
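One way to implement such a smoothing step is a plain moving average over the per-window positive-class probabilities; the following is a minimal sketch of that idea, where the filter width `k` and the threshold are illustrative values, not the parameters used in our experiments:

```python
import numpy as np

def smooth_predictions(probs: np.ndarray, k: int = 3, threshold: float = 0.5) -> np.ndarray:
    """Moving-average smoothing of per-window positive-class probabilities.

    probs: 1-D array of model outputs in [0, 1], one value per sliding window.
    Returns smoothed binary predictions (1 = hand washing, 0 = Null).
    """
    kernel = np.ones(k) / k
    # mode="same" keeps one smoothed value per input window; edge windows
    # simply see fewer neighbours.
    smoothed = np.convolve(probs, kernel, mode="same")
    return (smoothed >= threshold).astype(int)

# Isolated Null predictions inside a long washing segment get flipped:
raw = np.array([0.9, 0.8, 0.2, 0.9, 0.85, 0.1, 0.9, 0.95])
print(smooth_predictions(raw))  # -> [1 1 1 1 1 1 1 1]
```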
Normalization was shown to be ineffective for our approach, worsening the performance of almost all models. This could be due to the difference in distribution between the train and test sets. The parameters for normalization were estimated on the train set and applied to the test set, which can be inaccurate, because it assumes that both sets follow the same distribution. This was not the case here, which is probably why the normalized data was harder to learn from and test on than the non-normalized data.
For the reasons explained in section \ref{s_score}, we weight the S score more heavily than the F1 score. Thus, the best network for problem 1 is DeepConvLSTM-A, although only by a slight margin. The overall achieved S score of $0.819$ is based on a specificity of $0.751$ and a sensitivity of $0.90$, which means that $90\,\%$ of windows containing hand washing were correctly classified as hand washing. However, only $75.1\,\%$ of windows containing no hand washing were classified as Null, which leaves room for improvement, because the model still has a false positive rate of $24.9\,\%$.
Compared to the results obtained by Mondol et al. with HAWAD @sayeed_mondol_hawad_2020, with F1 scores over $90\,\%$, our approach may look weaker. Their detection of out-of-distribution samples sounds like a good idea in theory. However, their results and ours are not entirely comparable, because we did not train or evaluate on the same data. From what they report in their paper, they did not split the data by subjects, but rather by data windows, with random sampling. This means that, during training, their model saw data from subjects that it was later tested on. Although this is not technically a leak, our subject-wise split can be expected to deliver a better estimate of generalization performance, because our models' (over-)adaptation to specific subjects' styles or patterns of hand washing cannot yield a performance boost on unseen subjects. Nevertheless, the detection of out-of-distribution samples could possibly increase the performance of our models. Still, one has to keep in mind that a sample being out of distribution does not always mean that it cannot be hand washing, especially when testing on unseen subjects, who may well employ different patterns of motion. For these reasons, the comparability of the results seems rather low, with the performance of HAWAD likely being overestimated relative to our scenario.
##### Problem 2
From an outsider's perspective, the problem of classifying compulsive hand washing and distinguishing it from non-compulsive hand washing seems more difficult than problem 1: distinguishing different types of hand washing should be harder than telling hand washing apart from all other activities. However, the results for problem 2 suggest the opposite, as significantly higher F1 scores and S scores are reached. For the raw predictions, F1 scores of around $0.92$ are reached by the LSTM and DeepConvLSTM(-A), and an S score of $0.869$ by DeepConvLSTM. The classic machine learning methods SVM and RFC also reach good F1 scores near $0.89$, but significantly lower S scores below $0.735$. The scores reached without smoothing are even higher than the scores reached with smoothing for problem 1, indicating that classifying compulsive versus non-compulsive hand washing could be learned better than separating hand washing from all other activities. This could stem from the much smaller amount of data used for problem 2: we only included hand washing data here, all of which stems from our own data sets rather than from the external ones. The heterogeneity of the data set for problem 1 probably makes network training harder there. The class imbalance of the data used for problem 2 ($65\,\%$ positive samples, $35\,\%$ negative samples) is also smaller than that of problem 1. However, it is unclear whether this had an effect on the performance of the neural network based methods, as we used a weighted loss function to combat this problem.
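The weighted loss function is not spelled out in this section; as a sketch of the general idea in PyTorch, using inverse-frequency weights derived from the $65/35$ split above (the label encoding is an assumption):

```python
import torch
import torch.nn as nn

# Inverse-frequency class weights for a 35 % negative / 65 % positive split
# (index 0: non-compulsive, index 1: compulsive; encoding assumed).
class_weights = torch.tensor([1 / 0.35, 1 / 0.65])
criterion = nn.CrossEntropyLoss(weight=class_weights)

logits = torch.randn(8, 2)           # model outputs for a batch of 8 windows
targets = torch.randint(0, 2, (8,))  # ground-truth labels per window
loss = criterion(logits, targets)    # errors on the rarer class now cost more
```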
As for problem 1, normalizing the data led to a decrease rather than an increase in performance. We assume the same reasons as for problem 1.
The results of problem 2 with the application of smoothing look even more promising. DeepConvLSTM-A reaches a very high F1 score of $0.966$ and an S score of $0.911$, with a sensitivity of $0.997$ and a specificity of $0.839$. In addition, DeepConvLSTM, LSTM and LSTM-A all reach very similar performance levels. DeepConvLSTM-A has the highest values, but the margin is so small that the difference is insignificant. The performance is even better than without smoothing and hints that the detection of compulsive hand washing in a hand washing scenario is actually feasible. However, at this point we must stress again that the compulsive hand washing data used in this work is only simulated. Although the simulation was supervised with the help of expert psychologists, there is no guarantee that real compulsive hand washing is distinguishable with the same level of performance. Arguably, even if the real difference between hand washing and compulsive hand washing were smaller than in our data, the performance could still be high enough for a satisfactory separation of the two. It is likely that, once enough data from OCD patients is available, the models mentioned could be trained to detect non-simulated compulsive washing with high accuracy.
As there is no published previous work in the area of automatically detecting compulsive hand washing, the results cannot be compared to existing ones. The strong performance levels indicate a high probability that the approach is applicable in real-world testing. Unfortunately, as our real-world evaluation was limited to the best model for problem 1, we cannot report real-world results to support this hypothesis.
##### Problem 3
The problem of classifying hand washing and compulsive hand washing separately while distinguishing both from all other activities is arguably harder than the other two problems. Problem 3 can be seen as the unification of problems 1 and 2: classifying whether an activity is hand washing (problem 1) and, if so, whether it is compulsive hand washing (problem 2). As a three-class classification problem, problem 3 is thus more difficult and leaves more room for errors than the other two problems, so a lower level of performance must be expected.
Of the models trained directly on problem 3, DeepConvLSTM-A performed best, with a multiclass F1 score of $0.692$, a multiclass S score of $0.769$ and a mean diagonal value of the confusion matrix of $0.712$. DeepConvLSTM achieved slightly lower, but nearly as good performance. For this problem, the baseline classic machine learning methods performed much worse, with multiclass F1 and S scores, as well as mean diagonal values of the confusion matrix, all around $0.5$.
In addition to the models trained on problem 3, we also report the performance of a chained model that consists of the two best performing models for problems 1 and 2. Because problem 3 is the combination of problems 1 and 2, the chained model can be used to make the same predictions. The chained model we used combines DeepConvLSTM-A from problem 1 and DeepConvLSTM from problem 2, as those were the models with the best S scores on non-smoothed predictions for these two problems. The chained model reached an even higher performance, with a multiclass F1 score of $0.714$, a multiclass S score of $0.783$ and a mean diagonal value of the confusion matrix of $0.718$. This result is valuable, because it shows that the classifiers trained for problems 1 and 2 can outperform a classifier specifically trained for problem 3. It indicates that the sub-problems of problem 3 are more easily solved independently than problem 3 is solved directly. The downside of using two networks is that they take twice the time to train, twice the time and energy to run, and twice the memory or storage, and are thus less efficient. Especially on a smart watch or any embedded mobile device the models could be deployed on, this could be a big disadvantage compared to the single model trained for problem 3. The performance difference is significant but not large; the difference of $0.03$ in the multiclass S score could well be imperceptible to real-world users.
We did not apply smoothing for problem 3, but it could be done in theory using a slightly adapted approach, and it might improve the performance of the system further.
Because of the preliminary results, we did not apply normalization to the chained model. We did test all the other models for problem 3 with normalization, and the results match those of the other two problems: normalization did not help. We assume the same reasons as discussed for problem 1.
To conclude the results of problem 3, the overall performance on this more difficult problem is worse than the performances for problems 1 and 2. However, if we are willing to trade efficiency for performance, a chained model consisting of two models trained on the sub-problems 1 and 2 can be applied to obtain a performance boost.
todo: talk about the takeaways for each approach, which network is the best etc.
todo: normalization
### Practical applicability
real world evaluation
## Limitations of our Approach
## Limitations of our approach
##### Problem 2
The general performance of our models on problem 2 was high. However, one limitation of these results is that we cannot measure the models' performance in distinguishing compulsive hand washing from activities other than non-compulsive hand washing. Our models could, however, be employed together with other tools that provide the knowledge of whether the user is currently washing their hands.
## Comparison of goals to results
## Future work
todo:hpo, architecture (batch norm), data (more+ real ocd),
......@@ -25,5 +65,5 @@ todo:real world <- depending on results
## Conclusion
# Conclusion
todo: It's super!
\ No newline at end of file
......@@ -32,12 +32,10 @@ Inertial measurement units (IMUs) can measure different types of time series mov
## Goals
% todo fill a little more detail
In this work, we want to develop a method for the real time detection of hand washing and compulsive hand washing. We also want to test the method and report meaningful statistics of its success. Further, we want to test parts of the developed method in a real world scenario.
In this work, we want to develop a method for the real time detection of hand washing and compulsive hand washing. We also want to test the method and report meaningful statistics of its success. Further, we want to test parts of the developed method in a real world scenario. We then want to draw conclusions on the applicability of the developed systems in the real world.
### Detection of hand washing in real time from inertial motion sensors
We want to show that neural network based classification methods can be applied to the recognition of hand washing. We want to base our method on sensor data from inertial measurement sensors in smart watches or other wrist worn IMU-equipped devices. We want to detect the hand washing in real time and directly on the mobile, i.e. wrist wearable device, such as a smart watch. Doing so, we would be able to give instant real time feedback to the user of the device.
We want to show that neural network based classification methods can be applied to the recognition of hand washing. We want to base our method on sensor data from inertial measurement sensors in smart watches or other wrist worn IMU-equipped devices. We want to detect the hand washing in real time and directly on the mobile, i.e. on a wrist wearable device, such as a smart watch. Doing so, we would be able to give instant real time feedback to the user of the device.
### Separation of hand washing and compulsive hand washing
In addition to the detection of hand washing, the detection of obsessive-compulsive hand washing is part of our goals. We want to be able to separate compulsive from non-compulsive hand washing based on the inertial motion data. Especially for the scenario of possible interventions in the treatment of OCD, this separation is crucial, as patients also wash their hands in non-compulsive ways.
......
......@@ -125,7 +125,7 @@ To incorporate the "chance level" we use majority prediction and uniform random
## Neural network based detection of hand washing
As explained in Section \ref{section:har}, neural networks are the state of the art when it comes to human activity recognition. This also applies to hand washing detection, and thus our classification algorithms are entirely based on neural networks.
### modifications / preprocessing, train-test-splits, etc. todo name section
### Preprocessing and train-test-split
The data sets were each normalized separately to have zero mean and a standard deviation of $1$. We also tried training all the models without normalization and found that the performance on the validation set was better without it. We therefore included both a normalized and a non-normalized version of the data in our experiment, in order to compare the performance on the test set.
The sensor values were partitioned into windows using a sliding window approach. After some experimentation, a window length of $3\,s$ ($150$ samples) was fixed, with an overlap of $50\,\%$ ($75$ samples). The external data sets were only used for training. The prerecorded data sets containing hand washing and simulated obsessive hand washing were used for training and testing.
We used a train-test split of $85\,\%$ to $15\,\%$, split at the recording level. This means that training and testing are executed on distinct subsets of subjects, which ensures that the performance on the test set gives a good estimate of the generalization performance. As every person washes their hands in a slightly different way, this generalization is needed in the real world in order to detect unseen but similar patterns of hand washing or obsessive hand washing. The sliding windows were only computed after the train-test split, to avoid leakage from the test set into the training set.
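A minimal sketch of this preprocessing pipeline could look as follows (a sampling rate of $50\,Hz$ is implied by $150$ samples per $3\,s$; the recording arrays and channel count are illustrative placeholders, not our actual data loading code):

```python
import numpy as np

WINDOW, STRIDE = 150, 75  # 3 s at 50 Hz, 50 % overlap

def sliding_windows(recording: np.ndarray) -> np.ndarray:
    """Cut one recording of shape (T, channels) into overlapping windows."""
    starts = range(0, len(recording) - WINDOW + 1, STRIDE)
    return np.stack([recording[s:s + WINDOW] for s in starts])

def normalize(train: np.ndarray, test: np.ndarray):
    """Zero mean / unit std, with statistics estimated on the train split only."""
    mean, std = train.mean(axis=0), train.std(axis=0)
    return (train - mean) / std, (test - mean) / std

# Placeholder IMU recordings; the split happens at the recording/subject
# level *before* windowing, so no window straddles the train/test boundary.
rng = np.random.default_rng(0)
train_recordings = [rng.standard_normal((1000, 6)) for _ in range(3)]
test_recordings = [rng.standard_normal((800, 6))]
train_windows = np.concatenate([sliding_windows(r) for r in train_recordings])
test_windows = np.concatenate([sliding_windows(r) for r in test_recordings])
```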
......@@ -183,7 +183,7 @@ To our knowledge, no previous work exists, that couples DeepConvLSTM with the ex
### Training routines and hyperparameter search
We trained all the models using PyTorch @paszke_pytorch_2019. The data was loaded from the matroska container format using a modified version of the PyAV library. It was then processed in Numpy TODO ref, and converted to PyTorch tensors. The training of the neural networks took place on a GTX 1070 graphics card by NVIDIA.
We trained all the models using PyTorch @paszke_pytorch_2019. The data was loaded from the Matroska container format using a modified version of the PyAV library. It was then processed in NumPy @harris_array_2020 and converted to PyTorch tensors. The training of the neural networks took place on a single NVIDIA GTX 1070 graphics card.
The training data was split into $150\,s$ long windows, which were shuffled before being used to train the models on the different paradigms.
#### Hyperparameter search
......@@ -284,6 +284,7 @@ S\ score &= 2 \cdot \frac{Sensitivity \cdot Specificity}{Sensitivity + Specifici
\end{align}
\end{figure}
\label{s_score}
The sensitivity is the rate of positive samples that are correctly recognized, the specificity the rate of negatives that are correctly recognized. If both measures are close to $1$, the model performs well. The precision is the ratio of true positives among all positive predictions. It is similar to the sensitivity but also punishes false positives to some extent. The recall is identical to the sensitivity. The harmonic mean of recall and precision is called the F1 score and is also commonly used to evaluate binary prediction tasks. Since we especially need to balance specificity and sensitivity for our task, we additionally report the S score, which we define as the harmonic mean of specificity and sensitivity. One reason for reporting the S score is the lack of false positive punishment in the F1 score: the F1 score does not punish false positives as much as needed for the task of compulsive hand washing detection. While false positives are partly reflected in the precision, if there are many positives in the ground truth, the precision won't weigh false positives enough. Including the specificity in the measure therefore makes sure we do not lose track of false positives, which would be annoying to the user, especially if we send out smart watch notifications with vibration or sound alerts.
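All of the binary metrics above follow directly from the four confusion matrix counts; a minimal sketch:

```python
import numpy as np

def binary_scores(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute sensitivity, specificity, precision, F1 and S score (labels 0/1)."""
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    sensitivity = tp / (tp + fn)  # a.k.a. recall
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    f1 = 2 * precision * sensitivity / (precision + sensitivity)
    s_score = 2 * specificity * sensitivity / (specificity + sensitivity)
    return {"sensitivity": sensitivity, "specificity": specificity,
            "precision": precision, "F1": f1, "S": s_score}
```

With many positives in the ground truth and a fixed number of false positives, precision (and hence F1) stays high while specificity drops, which is exactly the case the S score is meant to catch.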
For the multiclass problem of distinguishing obsessive hand washing from normal hand washing and from other activities, the binary metrics are not applicable. Here, we report normalized confusion matrices and their mean diagonal values as one performance measure. The confusion matrix shows which proportion of the samples belonging to a certain class (true label, rows of the matrix) is predicted to belong to which class (predicted label, columns of the matrix). The normalized version of the confusion matrix replaces the absolute counts with ratios relative to the number of true labels for each class. This means that the values in each row of the matrix sum to $1$.
......@@ -300,7 +301,7 @@ Furthermore, we report an adapted S score for multiclass problem, defined in a s
S\ score\ multi = \frac{1}{3}\cdot \sum_{i=0}^2 S\ score(\mathbf{C}_i)
\end{align}
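Assuming $\mathbf{C}_i$ denotes the one-vs-rest confusion matrix of class $i$, the multiclass S score amounts to the following sketch:

```python
import numpy as np

def multiclass_s_score(y_true: np.ndarray, y_pred: np.ndarray,
                       n_classes: int = 3) -> float:
    """Mean of the one-vs-rest S scores over all classes."""
    scores = []
    for i in range(n_classes):
        tp = np.sum((y_true == i) & (y_pred == i))
        fn = np.sum((y_true == i) & (y_pred != i))
        fp = np.sum((y_true != i) & (y_pred == i))
        tn = np.sum((y_true != i) & (y_pred != i))
        sens, spec = tp / (tp + fn), tn / (tn + fp)
        scores.append(2 * sens * spec / (sens + spec))
    return float(np.mean(scores))
```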
We also report the metrics used for problems 1 and 2 on a binarized version of the third problem. To binarize the problem, we define either "hand washing" or "compulsive hand washing" as the positive class, and the remainder as negative class. Note that "hand washing" includes "compulsive hand washing". With this binarization, we can compare the models trained on the multiclass problem to the models trained on the initial binary problems. TODO: report these results!
We also report the metrics used for problem 1 on a binarized version of the third problem. To binarize the problem, we define "hand washing" as the positive class and the remainder as the negative class. Note that "hand washing" includes "compulsive hand washing". With this binarization, we can compare the models trained on the multiclass problem to the models trained on the initial binary problem. TODO: report these results!
\label{chained_model}
In addition, we report the performance of the two best models for problems 1 and 2 chained together and tested on problem 3. This means we run the best model for hand washing detection first, and then, for all sample windows detected as hand washing, we run the best model for classifying compulsive versus non-compulsive hand washing. From this chain, we derive three-class predictions: all windows not detected by the first model count as negatives (Null), windows predicted to be hand washing but not predicted to be compulsive by the second model count as hand washing (HW), and the remaining windows are classified as compulsive hand washing (HW-C). This chained model could possibly perform better, as it combines two different models which, taken together, have had more training time. However, chaining the models also takes up more memory and computation time on a device and is thus less efficient.
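A sketch of this decision chain (the model objects, their `predict()` interface and the label encoding are placeholders, not our actual implementation):

```python
import numpy as np

NULL, HW, HW_C = 0, 1, 2  # assumed three-class label encoding

def chained_predict(windows: np.ndarray, model_p1, model_p2) -> np.ndarray:
    """Combine two binary classifiers into three-class predictions.

    model_p1: hand washing vs. Null (problem 1)
    model_p2: compulsive vs. non-compulsive hand washing (problem 2)
    Both are assumed to return a 0/1 label per window via predict().
    """
    labels = np.full(len(windows), NULL)
    is_washing = model_p1.predict(windows).astype(bool)  # stage 1: all windows
    if is_washing.any():
        # stage 2: only the windows already classified as hand washing
        compulsive = model_p2.predict(windows[is_washing]).astype(bool)
        labels[is_washing] = np.where(compulsive, HW_C, HW)
    return labels
```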
......
......@@ -46,7 +46,7 @@ The results without smoothing of predictions for the second task, distinguishing
As for problem 1, applying normalization to the input data worsens the performance of almost all classifiers. The loss in F1 score ranges from $0.024$ (LSTM) to $0.11$ (CNN). For the FC network, normalization leads to a slight performance increase of $0.01$. The decrease in S score under normalization lies between $0.128$ (DeepConvLSTM-A) and $0.27$ (CNN). As with the F1 scores, the FC network benefits from normalization, here by a difference in S score of $0.035$. SVM and RFC do not perform better with normalization either.
The results for task 2 with the application of smoothing are shown in \ref{tbl:only_conv_hw_rm} and @fig:p2_metrics_rm. Similarly to problem 1, smoothing helps to further increase the performance of all classifiers. All neural network based methods reach F1 scores of over $0.95$. The best F1 score is achieved with DeepConvLSTM-A ($0.966$), the second best with LSTM ($0.965$). The differences remain small for this problem, as DeepConvLSTM ($0.963$) and LSTM-A ($0.961$) also achieve very similar scores. There is a small gap, after which the RFC ($0.922$) and SVM ($0.914$) follow. The traditional methods do not profit as much from the smoothing as the neural network based methods.
The results for task 2 with the application of smoothing are shown in table \ref{tbl:only_conv_hw_rm} and @fig:p2_metrics_rm. Similarly to problem 1, smoothing helps to further increase the performance of all classifiers. All neural network based methods reach F1 scores of over $0.95$. The best F1 score is achieved with DeepConvLSTM-A ($0.966$), the second best with LSTM ($0.965$). The differences remain small for this problem, as DeepConvLSTM ($0.963$) and LSTM-A ($0.961$) also achieve very similar scores. There is a small gap, after which the RFC ($0.922$) and SVM ($0.914$) follow. The traditional methods do not profit as much from the smoothing as the neural network based methods.
The S scores of the neural network based models are also high, the highest being $0.911$ (DeepConvLSTM-A), followed by $0.910$ (LSTM), $0.909$ (DeepConvLSTM) and $0.908$ (LSTM-A). The values of the CNN ($0.897$) and FC ($0.893$) are not far off either. However, the classical methods RFC ($0.761$) and SVM ($0.724$) do not reach the same level of performance, with the S score gap to the neural network based models even widening slightly after the application of smoothing.
......