% my_bibliography.bib
@article{suRenderCNNViewpoint2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1505.05641},
primaryClass = {cs},
title = {Render for {{CNN}}: {{Viewpoint Estimation}} in {{Images Using CNNs Trained}} with {{Rendered 3D Model Views}}},
doi = {10.1109/ICCV.2015.308},
shorttitle = {Render for {{CNN}}},
abstract = {Object viewpoint estimation from 2D images is an essential task in computer vision. However, two issues hinder its progress: scarcity of training data with viewpoint annotations, and a lack of powerful features. Inspired by the growing availability of 3D models, we propose a framework to address both issues by combining render-based image synthesis and CNNs. We believe that 3D models have the potential in generating a large number of images of high variation, which can be well exploited by deep CNN with a high learning capacity. Towards this goal, we propose a scalable and overfit-resistant image synthesis pipeline, together with a novel CNN specifically tailored for the viewpoint estimation task. Experimentally, we show that the viewpoint estimation from our pipeline can significantly outperform state-of-the-art methods on PASCAL 3D+ benchmark.},
date = {2015-05-21},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Su, Hao and Qi, Charles R. and Li, Yangyan and Guibas, Leonidas},
file = {/Users/abdullah/Zotero/storage/9G6TZKZT/Su et al. - 2015 - Render for CNN Viewpoint Estimation in Images Usi.pdf;/Users/abdullah/Zotero/storage/EVKTX4X2/1505.html}
}
% == BibLateX quality report for suRenderCNNViewpoint2015:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
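% A possible cleanup for the recurring arXiv-preprint warnings (a sketch, not a
% verified house style): biblatex's own eprint fields are 'eprinttype', 'eprint'
% and 'eprintclass', so 'archivePrefix' duplicates 'eprinttype' and can be
% dropped, while 'primaryClass' can be renamed to 'eprintclass'. Switching the
% entry type from @article to @online (or @misc) also lifts the 'journaltitle'
% requirement, and a sentence-case title addresses the Zotero title-case note.
% The commented entry below reuses only data already present above:
%
% @online{suRenderCNNViewpoint2015,
%   eprinttype  = {arxiv},
%   eprint      = {1505.05641},
%   eprintclass = {cs},
%   title       = {Render for {{CNN}}: Viewpoint estimation in images using {{CNNs}} trained with rendered {{3D}} model views},
%   doi         = {10.1109/ICCV.2015.308},
%   date        = {2015-05-21},
%   author      = {Su, Hao and Qi, Charles R. and Li, Yangyan and Guibas, Leonidas}
% }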
@article{mccormacSceneNetRGBD5M2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1612.05079},
primaryClass = {cs},
title = {{{SceneNet RGB}}-{{D}}: {{5M Photorealistic Images}} of {{Synthetic Indoor Trajectories}} with {{Ground Truth}}},
url = {http://arxiv.org/abs/1612.05079},
shorttitle = {{{SceneNet RGB}}-{{D}}},
abstract = {We introduce SceneNet RGB-D, expanding the previous work of SceneNet to enable large scale photorealistic rendering of indoor scene trajectories. It provides pixel-perfect ground truth for scene understanding problems such as semantic segmentation, instance segmentation, and object detection, and also for geometric computer vision problems such as optical flow, depth estimation, camera pose estimation, and 3D reconstruction. Random sampling permits virtually unlimited scene configurations, and here we provide a set of 5M rendered RGB-D images from over 15K trajectories in synthetic layouts with random but physically simulated object poses. Each layout also has random lighting, camera trajectories, and textures. The scale of this dataset is well suited for pre-training data-driven computer vision techniques from scratch with RGB-D inputs, which previously has been limited by relatively small labelled datasets in NYUv2 and SUN RGB-D. It also provides a basis for investigating 3D scene labelling tasks by providing perfect camera poses and depth data as proxy for a SLAM system. We host the dataset at http://robotvault.bitbucket.io/scenenet-rgbd.html},
urldate = {2019-05-16},
date = {2016-12-15},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {McCormac, John and Handa, Ankur and Leutenegger, Stefan and Davison, Andrew J.},
file = {/Users/abdullah/Zotero/storage/CDXZM66C/McCormac et al. - 2016 - SceneNet RGB-D 5M Photorealistic Images of Synthe.pdf;/Users/abdullah/Zotero/storage/JTV9IDWX/1612.html}
}
% == BibLateX quality report for mccormacSceneNetRGBD5M2016:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{shrivastavaLearningSimulatedUnsupervised2017,
title = {Learning {{From Simulated}} and {{Unsupervised Images Through Adversarial Training}}},
url = {http://openaccess.thecvf.com/content_cvpr_2017/html/Shrivastava_Learning_From_Simulated_CVPR_2017_paper.html},
eventtitle = {Proceedings of the {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
urldate = {2019-05-16},
date = {2017},
pages = {2107-2116},
author = {Shrivastava, Ashish and Pfister, Tomas and Tuzel, Oncel and Susskind, Joshua and Wang, Wenda and Webb, Russell},
file = {/Users/abdullah/Zotero/storage/ZSWEWGII/Shrivastava et al. - 2017 - Learning From Simulated and Unsupervised Images Th.pdf;/Users/abdullah/Zotero/storage/A5T2Z8NV/Shrivastava_Learning_From_Simulated_CVPR_2017_paper.html}
}
% == BibLateX quality report for shrivastavaLearningSimulatedUnsupervised2017:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
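% For the @inproceedings entries flagged as missing 'booktitle', one minimal
% option (an assumption about how the venue should be recorded, simply reusing
% the 'eventtitle' already stored in the entry above) would be:
%
% booktitle = {Proceedings of the {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},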
@article{tremblayTrainingDeepNetworks2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1804.06516},
primaryClass = {cs},
title = {Training {{Deep Networks}} with {{Synthetic Data}}: {{Bridging}} the {{Reality Gap}} by {{Domain Randomization}}},
url = {http://arxiv.org/abs/1804.06516},
shorttitle = {Training {{Deep Networks}} with {{Synthetic Data}}},
abstract = {We present a system for training deep neural networks for object detection using synthetic images. To handle the variability in real-world data, the system relies upon the technique of domain randomization, in which the parameters of the simulator--such as lighting, pose, object textures, etc.--are randomized in non-realistic ways to force the neural network to learn the essential features of the object of interest. We explore the importance of these parameters, showing that it is possible to produce a network with compelling performance using only non-artistically-generated synthetic data. With additional fine-tuning on real data, the network yields better performance than using real data alone. This result opens up the possibility of using inexpensive synthetic data for training neural networks while avoiding the need to collect large amounts of hand-annotated real-world data or to generate high-fidelity synthetic worlds--both of which remain bottlenecks for many applications. The approach is evaluated on bounding box detection of cars on the KITTI dataset.},
urldate = {2019-05-16},
date = {2018-04-17},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Tremblay, Jonathan and Prakash, Aayush and Acuna, David and Brophy, Mark and Jampani, Varun and Anil, Cem and To, Thang and Cameracci, Eric and Boochoon, Shaad and Birchfield, Stan},
file = {/Users/abdullah/Zotero/storage/5GA2RXS2/Tremblay et al. - 2018 - Training Deep Networks with Synthetic Data Bridgi.pdf;/Users/abdullah/Zotero/storage/DVTW9EQ9/1804.html}
}
% == BibLateX quality report for tremblayTrainingDeepNetworks2018:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{lowryVisualPlaceRecognition2016,
title = {Visual {{Place Recognition}}: {{A Survey}}},
volume = {32},
issn = {1552-3098},
doi = {10.1109/TRO.2015.2496823},
shorttitle = {Visual {{Place Recognition}}},
abstract = {Visual place recognition is a challenging problem due to the vast range of ways in which the appearance of real-world places can vary. In recent years, improvements in visual sensing capabilities, an ever-increasing focus on long-term mobile robot autonomy, and the ability to draw on state-of-the-art research in other disciplines--particularly recognition in computer vision and animal navigation in neuroscience--have all contributed to significant advances in visual place recognition systems. This paper presents a survey of the visual place recognition research landscape. We start by introducing the concepts behind place recognition--the role of place recognition in the animal kingdom, how a “place” is defined in a robotics context, and the major components of a place recognition system. Long-term robot operations have revealed that changing appearance can be a significant factor in visual place recognition failure; therefore, we discuss how place recognition solutions can implicitly or explicitly account for appearance change within the environment. Finally, we close with a discussion on the future of visual place recognition, in particular with respect to the rapid advances being made in the related fields of deep learning, semantic scene understanding, and video description.},
number = {1},
journaltitle = {IEEE Trans. Robot.},
date = {2016-02},
pages = {1-19},
keywords = {animal kingdom,animal navigation,Animals,computer vision,Computer vision,Conferences,deep learning,learning (artificial intelligence),long-term mobile robot autonomy,mobile robots,Navigation,object recognition,place recognition,Robot sensing systems,robot vision,robotics context,semantic scene understanding,video description,video signal processing,Visual place recognition,visual place recognition research landscape,visual place recognition system,visual sensing capabilities,Visualization},
author = {Lowry, S. and Sünderhauf, N. and Newman, P. and Leonard, J. J. and Cox, D. and Corke, P. and Milford, M. J.},
file = {/Users/abdullah/Zotero/storage/NVEGIQ4V/Lowry et al. - 2016 - Visual Place Recognition A Survey.pdf;/Users/abdullah/Zotero/storage/88BDTSHG/7339473.html}
}
% == BibLateX quality report for lowryVisualPlaceRecognition2016:
% ? Possibly abbreviated journal title IEEE Trans. Robot.
% ? Title looks like it was stored in title-case in Zotero
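% If unabbreviated journal names are preferred, the short form flagged above
% expands (assuming the standard IEEE title) to:
%
% journaltitle = {IEEE Transactions on Robotics},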
@inproceedings{orlandoImageBasedLocalization2019,
title = {Image {{Based Localization}} with {{Simulated Egocentric Navigations}}},
doi = {10.5220/0007356503050312},
date = {2019-01-01},
pages = {305-312},
author = {Orlando, Santi and Furnari, Antonino and Battiato, Sebastiano and Farinella, Giovanni},
file = {/Users/abdullah/Zotero/storage/HWNUWLMB/Orlando et al. - 2019 - Image Based Localization with Simulated Egocentric.pdf}
}
% == BibLateX quality report for orlandoImageBasedLocalization2019:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
@article{hinterstoisserPreTrainedImageFeatures2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1710.10710},
primaryClass = {cs},
title = {On {{Pre}}-{{Trained Image Features}} and {{Synthetic Images}} for {{Deep Learning}}},
url = {http://arxiv.org/abs/1710.10710},
abstract = {Deep Learning methods usually require huge amounts of training data to perform at their full potential, and often require expensive manual labeling. Using synthetic images is therefore very attractive to train object detectors, as the labeling comes for free, and several approaches have been proposed to combine synthetic and real images for training. In this paper, we show that a simple trick is sufficient to train very effectively modern object detectors with synthetic images only: We freeze the layers responsible for feature extraction to generic layers pre-trained on real images, and train only the remaining layers with plain OpenGL rendering. Our experiments with very recent deep architectures for object recognition (Faster-RCNN, R-FCN, Mask-RCNN) and image feature extractors (InceptionResnet and Resnet) show this simple approach performs surprisingly well.},
urldate = {2019-05-17},
date = {2017-10-29},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Hinterstoisser, Stefan and Lepetit, Vincent and Wohlhart, Paul and Konolige, Kurt},
file = {/Users/abdullah/Zotero/storage/JA432ZR5/Hinterstoisser et al. - 2017 - On Pre-Trained Image Features and Synthetic Images.pdf;/Users/abdullah/Zotero/storage/VVR4E92M/1710.html}
}
% == BibLateX quality report for hinterstoisserPreTrainedImageFeatures2017:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{brachmannLearningLessMore2018,
title = {Learning {{Less Is More}} - {{6D Camera Localization}} via {{3D Surface Regression}}},
url = {http://openaccess.thecvf.com/content_cvpr_2018/html/Brachmann_Learning_Less_Is_CVPR_2018_paper.html},
eventtitle = {Proceedings of the {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
urldate = {2019-05-17},
date = {2018},
pages = {4654-4662},
author = {Brachmann, Eric and Rother, Carsten},
file = {/Users/abdullah/Zotero/storage/8XGZ7LB6/Brachmann and Rother - 2018 - Learning Less Is More - 6D Camera Localization via.pdf;/Users/abdullah/Zotero/storage/M8C4KBZT/Brachmann_Learning_Less_Is_CVPR_2018_paper.html}
}
% == BibLateX quality report for brachmannLearningLessMore2018:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
@article{rajpuraObjectDetectionUsing2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1706.06782},
primaryClass = {cs},
title = {Object {{Detection Using Deep CNNs Trained}} on {{Synthetic Images}}},
url = {http://arxiv.org/abs/1706.06782},
abstract = {The need for large annotated image datasets for training Convolutional Neural Networks (CNNs) has been a significant impediment for their adoption in computer vision applications. We show that with transfer learning an effective object detector can be trained almost entirely on synthetically rendered datasets. We apply this strategy for detecting packaged food products clustered in refrigerator scenes. Our CNN trained only with 4000 synthetic images achieves mean average precision (mAP) of 24 on a test set with 55 distinct products as objects of interest and 17 distractor objects. A further increase of 12\% in the mAP is obtained by adding only 400 real images to these 4000 synthetic images in the training set. A high degree of photorealism in the synthetic images was not essential in achieving this performance. We analyze factors like training data set size and 3D model dictionary size for their influence on detection performance. Additionally, training strategies like fine-tuning with selected layers and early stopping which affect transfer learning from synthetic scenes to real scenes are explored. Training CNNs with synthetic datasets is a novel application of high-performance computing and a promising approach for object detection applications in domains where there is a dearth of large annotated image data.},
urldate = {2019-05-17},
date = {2017-06-21},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Rajpura, Param S. and Bojinov, Hristo and Hegde, Ravi S.},
file = {/Users/abdullah/Zotero/storage/QGRSWXH7/Rajpura et al. - 2017 - Object Detection Using Deep CNNs Trained on Synthe.pdf;/Users/abdullah/Zotero/storage/U7KFWG4V/1706.html}
}
% == BibLateX quality report for rajpuraObjectDetectionUsing2017:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{wangDeLS3DDeepLocalization2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1805.04949},
primaryClass = {cs},
title = {{{DeLS}}-{{3D}}: {{Deep Localization}} and {{Segmentation}} with a {{3D Semantic Map}}},
url = {http://arxiv.org/abs/1805.04949},
shorttitle = {{{DeLS}}-{{3D}}},
abstract = {For applications such as autonomous driving, self-localization/camera pose estimation and scene parsing are crucial technologies. In this paper, we propose a unified framework to tackle these two problems simultaneously. The uniqueness of our design is a sensor fusion scheme which integrates camera videos, motion sensors (GPS/IMU), and a 3D semantic map in order to achieve robustness and efficiency of the system. Specifically, we first have an initial coarse camera pose obtained from consumer-grade GPS/IMU, based on which a label map can be rendered from the 3D semantic map. Then, the rendered label map and the RGB image are jointly fed into a pose CNN, yielding a corrected camera pose. In addition, to incorporate temporal information, a multi-layer recurrent neural network (RNN) is further deployed to improve the pose accuracy. Finally, based on the pose from RNN, we render a new label map, which is fed together with the RGB image into a segment CNN which produces per-pixel semantic label. In order to validate our approach, we build a dataset with registered 3D point clouds and video camera images. Both the point clouds and the images are semantically-labeled. Each video frame has ground truth pose from highly accurate motion sensors. We show that practically, pose estimation solely relying on images like PoseNet may fail due to street view confusion, and it is important to fuse multiple sensors. Finally, various ablation studies are performed, which demonstrate the effectiveness of the proposed system. In particular, we show that scene parsing and pose estimation are mutually beneficial to achieve a more robust and accurate system.},
urldate = {2019-05-17},
date = {2018-05-13},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Wang, Peng and Yang, Ruigang and Cao, Binbin and Xu, Wei and Lin, Yuanqing},
file = {/Users/abdullah/Zotero/storage/WIMDWX5B/Wang et al. - 2018 - DeLS-3D Deep Localization and Segmentation with a.pdf;/Users/abdullah/Zotero/storage/CZSXRLRD/1805.html}
}
% == BibLateX quality report for wangDeLS3DDeepLocalization2018:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{acharyaBIMPoseNetIndoorCamera2019,
title = {{{BIM}}-{{PoseNet}}: {{Indoor}} Camera Localisation Using a {{3D}} Indoor Model and Deep Learning from Synthetic Images},
volume = {150},
doi = {10.1016/j.isprsjprs.2019.02.020},
shorttitle = {{{BIM}}-{{PoseNet}}},
abstract = {The ubiquity of cameras built in mobile devices has resulted in a renewed interest in image-based localisation in indoor environments where the global navigation satellite system (GNSS) signals are not available. Existing approaches for indoor localisation using images either require an initial location or need first to perform a 3D reconstruction of the whole environment using structure-from-motion (SfM) methods, which is challenging and time-consuming for large indoor spaces. In this paper, a visual localisation approach is proposed to eliminate the requirement of image-based reconstruction of the indoor environment by using a 3D indoor model. A deep convolutional neural network (DCNN) is fine-tuned using synthetic images obtained from the 3D indoor model to regress the camera pose. Results of the experiments indicate that the proposed approach can be used for indoor localisation in real-time with an accuracy of approximately 2 m.},
journaltitle = {ISPRS Journal of Photogrammetry and Remote Sensing},
date = {2019-03-06},
pages = {245-258},
author = {Acharya, Debaditya and Khoshelham, Kourosh and Winter, Stephan},
file = {/Users/abdullah/Zotero/storage/L7ZF2XUJ/Acharya et al. - 2019 - BIM-PoseNet Indoor camera localisation using a 3D.pdf}
}
@inproceedings{lofflerEvaluationCriteriaInsideOut2018,
title = {Evaluation {{Criteria}} for {{Inside}}-{{Out Indoor Positioning Systems Based}} on {{Machine Learning}}},
doi = {10.1109/IPIN.2018.8533862},
abstract = {Real-time tracking allows to trace goods and enables the optimization of logistics processes in many application areas. Camera-based inside-out tracking that uses an infrastructure of fixed and known markers is costly as the markers need to be installed and maintained in the environment. Instead, systems that use natural markers suffer from changes in the physical environment. Recently a number of approaches based on machine learning (ML) aim to address such issues. This paper proposes evaluation criteria that consider algorithmic properties of ML-based positioning schemes and introduces a dataset from an indoor warehouse scenario to evaluate for them. Our dataset consists of images labeled with millimeter precise positions that allows for a better development and performance evaluation of learning algorithms. This allows an evaluation of machine learning algorithms for monocular optical positioning in a realistic indoor position application for the first time. We also show the feasibility of ML-based positioning schemes for an industrial deployment.},
date = {2018-09-24},
author = {Löffler, Christoffer and Riechel, Sascha and Fischer, Janina and Mutschler, Christopher}
}
% == BibLateX quality report for lofflerEvaluationCriteriaInsideOut2018:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
@article{yassinRecentAdvancesIndoor2016,
title = {Recent {{Advances}} in {{Indoor Localization}}: {{A Survey}} on {{Theoretical Approaches}} and {{Applications}}},
volume = {PP},
doi = {10.1109/COMST.2016.2632427},
shorttitle = {Recent {{Advances}} in {{Indoor Localization}}},
abstract = {The availability of location information has become a key factor in today’s communications systems allowing location based services. In outdoor scenarios, the mobile terminal position is obtained with high accuracy thanks to the Global Positioning System (GPS) or to the standalone cellular systems. However, the main problem of GPS and cellular systems resides in the indoor environment and in scenarios with deep shadowing effects where the satellite or cellular signals are broken. In this paper, we survey different technologies and methodologies for indoor and outdoor localization with an emphasis on indoor methodologies and concepts. Additionally, we discuss in this review different localization-based applications, where the location information is critical to estimate. Finally, a comprehensive discussion of the challenges in terms of accuracy, cost, complexity, security, scalability, etc. is given. The aim of this survey is to provide a comprehensive overview of existing efforts as well as auspicious and anticipated dimensions for future work in indoor localization techniques and applications.},
journaltitle = {IEEE Communications Surveys \& Tutorials},
date = {2016-11-29},
pages = {1-1},
author = {Yassin, Ali and Nasser, Youssef and Awad, Mariette and Al-Dubai, Ahmed and Liu, Ran and Yuen, Chau and Raulefs, Ronald},
file = {/Users/abdullah/Zotero/storage/HK3TXFRK/Yassin et al. - 2016 - Recent Advances in Indoor Localization A Survey o.pdf}
}
% == BibLateX quality report for yassinRecentAdvancesIndoor2016:
% ? Title looks like it was stored in title-case in Zotero
@article{simonyanDeepConvolutionalNetworks2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.6034},
primaryClass = {cs},
title = {Deep {{Inside Convolutional Networks}}: {{Visualising Image Classification Models}} and {{Saliency Maps}}},
url = {http://arxiv.org/abs/1312.6034},
shorttitle = {Deep {{Inside Convolutional Networks}}},
abstract = {This paper addresses the visualisation of image classification models, learnt using deep Convolutional Networks (ConvNets). We consider two visualisation techniques, based on computing the gradient of the class score with respect to the input image. The first one generates an image, which maximises the class score [Erhan et al., 2009], thus visualising the notion of the class, captured by a ConvNet. The second technique computes a class saliency map, specific to a given image and class. We show that such maps can be employed for weakly supervised object segmentation using classification ConvNets. Finally, we establish the connection between the gradient-based ConvNet visualisation methods and deconvolutional networks [Zeiler et al., 2013].},
urldate = {2019-05-20},
date = {2013-12-20},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Simonyan, Karen and Vedaldi, Andrea and Zisserman, Andrew},
file = {/Users/abdullah/Zotero/storage/TQYCLX8I/Simonyan et al. - 2013 - Deep Inside Convolutional Networks Visualising Im.pdf;/Users/abdullah/Zotero/storage/M4K8NYGC/1312.html}
}
% == BibLateX quality report for simonyanDeepConvolutionalNetworks2013:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{sunderhaufPerformanceConvNetFeatures2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1501.04158},
primaryClass = {cs},
title = {On the {{Performance}} of {{ConvNet Features}} for {{Place Recognition}}},
url = {http://arxiv.org/abs/1501.04158},
abstract = {After the incredible success of deep learning in the computer vision domain, there has been much interest in applying Convolutional Network (ConvNet) features in robotic fields such as visual navigation and SLAM. Unfortunately, there are fundamental differences and challenges involved. Computer vision datasets are very different in character to robotic camera data, real-time performance is essential, and performance priorities can be different. This paper comprehensively evaluates and compares the utility of three state-of-the-art ConvNets on the problems of particular relevance to navigation for robots; viewpoint-invariance and condition-invariance, and for the first time enables real-time place recognition performance using ConvNets with large maps by integrating a variety of existing (locality-sensitive hashing) and novel (semantic search space partitioning) optimization techniques. We present extensive experiments on four real world datasets cultivated to evaluate each of the specific challenges in place recognition. The results demonstrate that speed-ups of two orders of magnitude can be achieved with minimal accuracy degradation, enabling real-time performance. We confirm that networks trained for semantic place categorization also perform better at (specific) place recognition when faced with severe appearance changes and provide a reference for which networks and layers are optimal for different aspects of the place recognition problem.},
urldate = {2019-05-20},
date = {2015-01-17},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Robotics},
author = {Sünderhauf, Niko and Dayoub, Feras and Shirazi, Sareh and Upcroft, Ben and Milford, Michael},
file = {/Users/abdullah/Zotero/storage/4ZE2WRQF/Sünderhauf et al. - 2015 - On the Performance of ConvNet Features for Place R.pdf;/Users/abdullah/Zotero/storage/GMT4F3LJ/1501.html}
}
% == BibLateX quality report for sunderhaufPerformanceConvNetFeatures2015:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{balntasRelocNetContinuousMetric2018a,
title = {{{RelocNet}}: {{Continuous Metric Learning Relocalisation}} Using {{Neural Nets}}},
url = {http://openaccess.thecvf.com/content_ECCV_2018/html/Vassileios_Balntas_RelocNet_Continous_Metric_ECCV_2018_paper.html},
shorttitle = {{{RelocNet}}},
eventtitle = {Proceedings of the {{European Conference}} on {{Computer Vision}} ({{ECCV}})},
urldate = {2019-06-01},
date = {2018},
pages = {751-767},
author = {Balntas, Vassileios and Li, Shuda and Prisacariu, Victor},
file = {/Users/abdullah/Zotero/storage/R2R4AVI8/Balntas et al. - 2018 - RelocNet Continuous Metric Learning Relocalisatio.pdf;/Users/abdullah/Zotero/storage/283RBUN6/Vassileios_Balntas_RelocNet_Continous_Metric_ECCV_2018_paper.html}
}
% == BibLateX quality report for balntasRelocNetContinuousMetric2018a:
% Missing required field 'booktitle'
@article{muellerCNNBASEDINITIALLOCALIZATION2018,
langid = {english},
title = {{{CNN}}-{{BASED INITIAL LOCALIZATION IMPROVED BY DATA AUGMENTATION}}},
volume = {IV-1},
issn = {2194-9050},
doi = {10.5194/isprs-annals-IV-1-117-2018},
abstract = {Image-based localization or camera re-localization is a fundamental task in computer vision and mandatory in the fields of navigation for robotics and autonomous driving or for virtual and augmented reality. Such image pose regression in 6 Degrees of Freedom (DoF) is recently solved by Convolutional Neural Networks (CNNs). However, already well-established methods based on feature matching still score higher accuracies so far. Therefore, we want to investigate how data augmentation could further improve CNN-based pose regression. Data augmentation is a valuable technique to boost performance on training based methods and wide spread in the computer vision community. Our aim in this paper is to show the benefit of data augmentation for pose regression by CNNs. For this purpose images are rendered from a 3D model of the actual test environment. This model again is generated by the original training data set, whereas no additional information nor data is required. Furthermore we introduce different training sets composed of rendered and real images. It is shown that the enhanced training of CNNs by utilizing 3D models of the environment improves the image localization accuracy. The accuracy of pose regression could be improved up to 69.37\% for the position component and 61.61\% for the rotation component on our investigated data set.},
journaltitle = {ISPRS Ann. Photogramm. Remote Sens. Spatial Inf. Sci.},
date = {2018-09-26},
pages = {117-124},
author = {Mueller, M. S. and Metzger, A. and Jutzi, B.},
file = {/Users/abdullah/Zotero/storage/G8YRPBL2/Mueller et al. - 2018 - CNN-BASED INITIAL LOCALIZATION IMPROVED BY DATA AU.pdf}
}
% == BibLateX quality report for muellerCNNBASEDINITIALLOCALIZATION2018:
% ? Possibly abbreviated journal title ISPRS Ann. Photogramm. Remote Sens. Spatial Inf. Sci.
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{caiHybridProbabilisticModel2018,
title = {A {{Hybrid Probabilistic Model}} for {{Camera Relocalization}}},
abstract = {We present a hybrid deep learning method for modelling the uncertainty of camera relocalization from a single RGB image. The proposed system leverages the discriminative deep image representation from a convolutional neural networks, and uses Gaussian Process regressors to generate the probability distribution of the six degree of freedom (6DoF) camera pose in an end-to-end fashion. This results in a network that can generate uncertainties over its inferences with no need to sample many times. Furthermore we show that our objective based on KL divergence reduces the dependence on the choice of hyperparameters. The results show that compared to the state-of-the-art Bayesian camera relocalization method, our model produces comparable localization uncertainty and improves the system efficiency significantly, without loss of accuracy.},
booktitle = {{{BMVC}}},
date = {2018},
keywords = {Approximation algorithm,Artificial neural network,Convolutional neural network,Deep learning,End-to-end principle,Gaussian process,KL-ONE,Kriging,The Australian},
author = {Cai, Ming and Shen, Chunhua and Reid, Ian D.},
file = {/Users/abdullah/Zotero/storage/MPZJ25A4/Cai et al. - 2018 - A Hybrid Probabilistic Model for Camera Relocaliza.pdf}
}
% == BibLateX quality report for caiHybridProbabilisticModel2018:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
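% The terse 'booktitle = {{{BMVC}}}' flagged above could be spelled out
% (assuming the usual expansion of the acronym) as:
%
% booktitle = {British Machine Vision Conference ({{BMVC}})},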
@inproceedings{kovalevDeepLearningTheano2016,
title = {Deep {{Learning}} with {{Theano}}, {{Torch}}, {{Caffe}}, {{TensorFlow}}, and {{Deeplearning4J}}: {{Which One Is}} the {{Best}} in {{Speed}} and {{Accuracy}}?},
shorttitle = {Deep {{Learning}} with {{Theano}}, {{Torch}}, {{Caffe}}, {{TensorFlow}}, and {{Deeplearning4J}}},
abstract = {This paper presents results of a comparative study of the leading Deep Learning frameworks, including Theano (with Keras wrapper), Torch, Caffe, TensorFlow, and Deeplearning4J. Detailed results of quantitative assessment of their training and predicting speed, as well as resultant classification accuracy, are provided. The research was conducted jointly by the United Institute of Informatics Problems (Belarus National Academy of Sciences) and Altoros, a global provider of big data and Platform-as-a-Service solutions.},
date = {2016-10-03},
author = {Kovalev, Vassili and Kalinovsky, Alexander and Kovalev, Sergey},
file = {/Users/abdullah/Zotero/storage/AEIR9UCS/Kovalev et al. - 2016 - Deep Learning with Theano, Torch, Caffe, TensorFlo.pdf}
}
% == BibLateX quality report for kovalevDeepLearningTheano2016:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
@article{guIndoorLocalizationImproved2019,
title = {Indoor {{Localization Improved}} by {{Spatial Context}} - {{A Survey}}},
volume = {52},
abstract = {Indoor localization is essential for healthcare, security, augmented reality gaming, and many other location-based services. There is currently a wealth of relevant literature on indoor localization. This paper focuses on recent advances in indoor localization methods that use spatial context to improve the location estimation. Spatial context in the form of maps and spatial models have been used to improve the localization by constraining location estimates in the navigable parts of indoor environments. Landmarks such as doors and corners, which are also one form of spatial context, have proved useful in assisting indoor localization by correcting the localization error. This survey gives a comprehensive review of state-of-the-art indoor localization methods and localization improvement methods using maps, spatial models, and landmarks.},
journaltitle = {ACM Computing Surveys},
date = {2019-06-10},
pages = {64:1-35},
author = {Gu, Fuqiang and Hu, Xuke and Ramezani, Milad and Acharya, Debaditya and Khoshelham, Kourosh and Valaee, Shahrokh and Shang, Jianga},
file = {/Users/abdullah/Zotero/storage/ATALC3TA/Gu et al. - 2019 - Indoor Localization Improved by Spatial Context - .pdf}
}
% == BibLateX quality report for guIndoorLocalizationImproved2019:
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{haImagebasedIndoorLocalization2018,
langid = {english},
location = {{Taipei, Taiwan}},
title = {Image-Based {{Indoor Localization Using BIM}} and {{Features}} of {{CNN}}},
doi = {10.22260/ISARC2018/0107},
abstract = {This study suggests an indoor localization method to estimate the location of a user of a mobile device with imaging capability. The proposed method uses a matching approach between an actual photograph and a rendered BIM (building information modeling) image. A pre-trained VGG 16 network is used for feature extraction. Experimental results show that the best image matching performance can be obtained when using features from pooling layer 4 of VGG16. The proposed method allows for indoor localization only by image matching without additional sensing information.},
eventtitle = {34th {{International Symposium}} on {{Automation}} and {{Robotics}} in {{Construction}}},
date = {2018-07-22},
author = {Ha, Inhae and Kim, Hongjo and Park, Somin and Kim, Hyoungkwan},
file = {/Users/abdullah/Zotero/storage/QH9PMFU6/Ha et al. - 2018 - Image-based Indoor Localization Using BIM and Feat.pdf}
}
% == BibLateX quality report for haImagebasedIndoorLocalization2018:
% Missing required field 'booktitle'
@article{prueferTobiasHassenkloeverKlassifikation,
langid = {german},
title = {Tobias Hassenklöver Klassifikation hochvarianter Muster mit Faltungsnetzwerken},
abstract = {The classification of objects and especially of humans by software is for years a great challenge. These highly variant forms are optically captured and often detected with the use of neural networks in combination with statistical methods. A new method for the detection of these highly variant patterns, such as characters, objects or people are Convolutional neural networks. Convolutional neural networks are a variety of Neural networks, which make their decisions based on algorithms used in image processing. In this thesis the limits of the detection of Convolutional neural networks are tested. In the tests the impact on the classification accuracy is checked with various modified input data.},
pages = {59},
author = {Prüfer, Betreuender and Fohl, Dr Wolfgang},
file = {/Users/abdullah/Zotero/storage/2KIIQBFT/Prüfer and Fohl - Tobias Hassenklöver Klassifikation hochvarianter Mu.pdf}
}
% == BibLateX quality report for prueferTobiasHassenkloeverKlassifikation:
% Exactly one of 'date' / 'year' must be present
% Missing required field 'journaltitle'
@article{kendallGeometryUncertaintyinDeep,
langid = {english},
title = {Geometry and {{Uncertainty in Deep Learning}} for {{Computer Vision}}},
pages = {208},
author = {Kendall, Alex Guy},
file = {/Users/abdullah/Zotero/storage/CK2KUICS/Kendall - Geometry and Uncertaintyin Deep Learning for Compu.pdf}
}
% == BibLateX quality report for kendallGeometryUncertaintyinDeep:
% Exactly one of 'date' / 'year' must be present
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{furukawaInternetscaleMultiviewStereo2010,
langid = {english},
location = {{San Francisco, CA, USA}},
title = {Towards {{Internet}}-Scale Multi-View Stereo},
isbn = {978-1-4244-6984-0},
doi = {10.1109/CVPR.2010.5539802},
abstract = {This paper introduces an approach for enabling existing multi-view stereo methods to operate on extremely large unstructured photo collections. The main idea is to decompose the collection into a set of overlapping sets of photos that can be processed in parallel, and to merge the resulting reconstructions. This overlapping clustering problem is formulated as a constrained optimization and solved iteratively. The merging algorithm, designed to be parallel and out-of-core, incorporates robust filtering steps to eliminate low-quality reconstructions and enforce global visibility constraints. The approach has been tested on several large datasets downloaded from Flickr.com, including one with over ten thousand images, yielding a 3D reconstruction with nearly thirty million points.},
eventtitle = {2010 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {2010 {{IEEE Computer Society Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
publisher = {{IEEE}},
date = {2010-06},
pages = {1434-1441},
author = {Furukawa, Yasutaka and Curless, Brian and Seitz, Steven M. and Szeliski, Richard},
file = {/Users/abdullah/Zotero/storage/366VAF5B/Furukawa et al. - 2010 - Towards Internet-scale multi-view stereo.pdf}
}
% == BibLateX quality report for furukawaInternetscaleMultiviewStereo2010:
% ? Unsure about the formatting of the booktitle
@article{piascoSurveyVisualBasedLocalization2018,
title = {A Survey on {{Visual}}-{{Based Localization}}: {{On}} the Benefit of Heterogeneous Data},
volume = {74},
doi = {10.1016/j.patcog.2017.09.013},
shorttitle = {A Survey on {{Visual}}-{{Based Localization}}},
abstract = {We are surrounded by plenty of information about our environment. From these multiple sources, numerous data could be extracted: set of images, 3D model, coloured points cloud... When classical localization devices failed (e.g. GPS sensor in cluttered environments), aforementioned data could be used within a localization framework. This is called Visual Based Localization (VBL). Due to numerous data types that can be collected from a scene, VBL encompasses a large amount of different methods. This paper presents a survey about recent methods that localize a visual acquisition system according to a known environment. We start by categorizing VBL methods into two distinct families: indirect and direct localization systems. As the localization environment is almost always dynamic, we pay special attention to methods designed to handle appearances changes occurring in a scene. Thereafter, we highlight methods exploiting heterogeneous types of data. Finally, we conclude the paper with a discussion on promising trends that could permit to a localization system to reach high precision pose estimation within an area as large as possible.},
journaltitle = {Pattern Recognit.},
date = {2018-02},
pages = {90 - 109},
keywords = {Camera Relocalisation,Image-based localization,Pose estimation,Visual geo-localization},
author = {Piasco, Nathan and Sidibé, Désiré and Demonceaux, Cédric and Gouet-Brunet, Valérie},
file = {/Users/abdullah/Zotero/storage/97H79C5H/Piasco et al. - 2018 - A survey on Visual-Based Localization On the bene.pdf}
}
% == BibLateX quality report for piascoSurveyVisualBasedLocalization2018:
% ? Possibly abbreviated journal title Pattern Recognit.
@inproceedings{shottonSceneCoordinateRegression2013,
title = {Scene {{Coordinate Regression Forests}} for {{Camera Relocalization}} in {{RGB}}-{{D Images}}},
doi = {10.1109/CVPR.2013.377},
eventtitle = {Proceedings of the {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
date = {2013},
pages = {2930-2937},
author = {Shotton, Jamie and Glocker, Ben and Zach, Christopher and Izadi, Shahram and Criminisi, Antonio and Fitzgibbon, Andrew},
file = {/Users/abdullah/Zotero/storage/K49T3LT8/Shotton et al. - 2013 - Scene Coordinate Regression Forests for Camera Rel.pdf;/Users/abdullah/Zotero/storage/N6LI2KWX/Shotton_Scene_Coordinate_Regression_2013_CVPR_paper.html}
}
% == BibLateX quality report for shottonSceneCoordinateRegression2013:
% Missing required field 'booktitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{varolLearningSyntheticHumans2017,
langid = {english},
location = {{Honolulu, HI}},
title = {Learning from {{Synthetic Humans}}},
isbn = {978-1-5386-0457-1},
doi = {10.1109/CVPR.2017.492},
abstract = {Estimating human pose, shape, and motion from images and videos are fundamental challenges with many applications. Recent advances in 2D human pose estimation use large amounts of manually-labeled training data for learning convolutional neural networks (CNNs). Such data is time consuming to acquire and difficult to extend. Moreover, manual labeling of 3D pose, depth and motion is impractical. In this work we present SURREAL (Synthetic hUmans foR REAL tasks): a new large-scale dataset with synthetically-generated but realistic images of people rendered from 3D sequences of human motion capture data. We generate more than 6 million frames together with ground truth pose, depth maps, and segmentation masks. We show that CNNs trained on our synthetic dataset allow for accurate human depth estimation and human part segmentation in real RGB images. Our results and the new dataset open up new possibilities for advancing person analysis using cheap and large-scale synthetic data.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
date = {2017-07},
pages = {4627-4635},
author = {Varol, Gul and Romero, Javier and Martin, Xavier and Mahmood, Naureen and Black, Michael J. and Laptev, Ivan and Schmid, Cordelia},
file = {/Users/abdullah/Zotero/storage/QFNDJR64/Varol et al. - 2017 - Learning from Synthetic Humans.pdf}
}
% == BibLateX quality report for varolLearningSyntheticHumans2017:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{kendallGeometricLossFunctions2017,
langid = {english},
location = {{Honolulu, HI}},
title = {Geometric {{Loss Functions}} for {{Camera Pose Regression}} with {{Deep Learning}}},
isbn = {978-1-5386-0457-1},
doi = {10.1109/CVPR.2017.694},
abstract = {Deep learning has shown to be effective for robust and real-time monocular image relocalisation. In particular, PoseNet [22] is a deep convolutional neural network which learns to regress the 6-DOF camera pose from a single image. It learns to localize using high level features and is robust to difficult lighting, motion blur and unknown camera intrinsics, where point based SIFT registration fails. However, it is trained using a naive loss function, with hyperparameters which require expensive tuning. In this paper, we give the problem a more fundamental theoretical treatment. We explore a number of novel loss functions for learning camera pose which are based on geometry and scene reprojection error. Additionally we show how to automatically learn an optimal weighting to simultaneously regress position and orientation. By leveraging geometry, we demonstrate that our technique significantly improves PoseNet’s performance across datasets ranging from indoor rooms to a small city.},
eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
date = {2017-07},
pages = {6555-6564},
author = {Kendall, Alex and Cipolla, Roberto},
file = {/Users/abdullah/Zotero/storage/T2W2SQNP/Kendall und Cipolla - 2017 - Geometric Loss Functions for Camera Pose Regressio.pdf}
}
% == BibLateX quality report for kendallGeometricLossFunctions2017:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{wangDeepVOEndtoendVisual2017,
langid = {english},
location = {{Singapore, Singapore}},
title = {{{DeepVO}}: {{Towards}} End-to-End Visual Odometry with Deep {{Recurrent Convolutional Neural Networks}}},
isbn = {978-1-5090-4633-1},
doi = {10.1109/ICRA.2017.7989236},
shorttitle = {{{DeepVO}}},
abstract = {This paper studies monocular visual odometry (VO) problem. Most of existing VO algorithms are developed under a standard pipeline including feature extraction, feature matching, motion estimation, local optimisation, etc. Although some of them have demonstrated superior performance, they usually need to be carefully designed and specifically fine-tuned to work well in different environments. Some prior knowledge is also required to recover an absolute scale for monocular VO. This paper presents a novel end-to-end framework for monocular VO by using deep Recurrent Convolutional Neural Networks (RCNNs). Since it is trained and deployed in an end-to-end manner, it infers poses directly from a sequence of raw RGB images (videos) without adopting any module in the conventional VO pipeline. Based on the RCNNs, it not only automatically learns effective feature representation for the VO problem through Convolutional Neural Networks, but also implicitly models sequential dynamics and relations using deep Recurrent Neural Networks. Extensive experiments on the KITTI VO dataset show competitive performance to state-of-the-art methods, verifying that the end-to-end Deep Learning technique can be a viable complement to the traditional VO systems.},
eventtitle = {2017 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
booktitle = {2017 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
publisher = {{IEEE}},
date = {2017-05},
pages = {2043-2050},
author = {Wang, Sen and Clark, Ronald and Wen, Hongkai and Trigoni, Niki},
file = {/Users/abdullah/Zotero/storage/2JTAL47J/Wang et al. - 2017 - DeepVO Towards end-to-end visual odometry with de.pdf}
}
% == BibLateX quality report for wangDeepVOEndtoendVisual2017:
% ? Unsure about the formatting of the booktitle
@article{costanteExploringRepresentationLearning2016,
langid = {english},
title = {Exploring {{Representation Learning With CNNs}} for {{Frame}}-to-{{Frame Ego}}-{{Motion Estimation}}},
volume = {1},
issn = {2377-3766, 2377-3774},
doi = {10.1109/LRA.2015.2505717},
abstract = {Visual Ego-Motion Estimation, or briefly Visual Odometry (VO), is one of the key building blocks of modern SLAM systems. In the last decade, impressive results have been demonstrated in the context of visual navigation, reaching very high localization performance. However, all ego-motion estimation systems require careful parameter tuning procedures for the specific environment they have to work in. Furthermore, even in ideal scenarios, most state-of-the-art approaches fail to handle image anomalies and imperfections, which results in less robust estimates. VO systems that rely on geometrical approaches extract sparse or dense features and match them to perform Frame to Frame (F2F) motion estimation. However, images contain much more information that can be used to further improve the F2F estimation. To learn new feature representation a very successful approach is to use deep Convolutional Neural Networks. Inspired by recent advances in Deep Networks and by previous work on learning methods applied to VO, we explore the use of Convolutional Neural Networks to learn both the best visual features and the best estimator for the task of visual Ego-Motion Estimation. With experiments on publicly available datasets we show that our approach is robust with respect to blur, luminance and contrast anomalies and outperforms most state-of-the-art approaches even in nominal conditions.},
number = {1},
journaltitle = {IEEE Robot. Autom. Lett.},
date = {2016-01},
pages = {18-25},
author = {Costante, Gabriele and Mancini, Michele and Valigi, Paolo and Ciarfuglia, Thomas A.},
file = {/Users/abdullah/Zotero/storage/YPIGDKIY/Costante et al. - 2016 - Exploring Representation Learning With CNNs for Fr.pdf}
}
% == BibLateX quality report for costanteExploringRepresentationLearning2016:
% 'issn': not a valid ISSN
% ? Possibly abbreviated journal title IEEE Robot. Autom. Lett.
% ? Title looks like it was stored in title-case in Zotero
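% The ISSN warning above comes from two values sharing one field
% ('2377-3766, 2377-3774'); keeping a single one of them, e.g.
%
% issn = {2377-3766},
%
% should satisfy the check (the entry does not say which of the two is the
% print ISSN, so the choice here is arbitrary).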
@incollection{melekhovRelativeCameraPose2017,
langid = {english},
location = {{Cham}},
title = {Relative {{Camera Pose Estimation Using Convolutional Neural Networks}}},
volume = {10617},
isbn = {978-3-319-70352-7 978-3-319-70353-4},
abstract = {This paper presents a convolutional neural network based approach for estimating the relative pose between two cameras. The proposed network takes RGB images from both cameras as input and directly produces the relative rotation and translation as output. The system is trained in an end-to-end manner utilising transfer learning from a large scale classification dataset. The introduced approach is compared with widely used local feature based methods (SURF, ORB) and the results indicate a clear improvement over the baseline. In addition, a variant of the proposed architecture containing a spatial pyramid pooling (SPP) layer is evaluated and shown to further improve the performance.},
booktitle = {Advanced {{Concepts}} for {{Intelligent Vision Systems}}},
publisher = {{Springer International Publishing}},
date = {2017},
pages = {675-687},
author = {Melekhov, Iaroslav and Ylioinas, Juha and Kannala, Juho and Rahtu, Esa},
editor = {Blanc-Talon, Jacques and Penne, Rudi and Philips, Wilfried and Popescu, Dan and Scheunders, Paul},
file = {/Users/abdullah/Zotero/storage/C82DPP7X/Melekhov et al. - 2017 - Relative Camera Pose Estimation Using Convolutiona.pdf},
doi = {10.1007/978-3-319-70353-4_57}
}
% == BibLateX quality report for melekhovRelativeCameraPose2017:
% 'isbn': not a valid ISBN
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{parisottoGlobalPoseEstimation2018,
langid = {english},
location = {{Salt Lake City, UT}},
title = {Global {{Pose Estimation}} with an {{Attention}}-{{Based Recurrent Network}}},
isbn = {978-1-5386-6100-0},
doi = {10.1109/CVPRW.2018.00061},
abstract = {The ability for an agent to localize itself within an environment is crucial for many real-world applications. For unknown environments, Simultaneous Localization and Mapping (SLAM) enables incremental and concurrent building of and localizing within a map. We present a new, differentiable architecture, Neural Graph Optimizer, progressing towards a complete neural network solution for SLAM by designing a system composed of a local pose estimation model, a novel pose selection module, and a novel graph optimization process. The entire architecture is trained in an end-to-end fashion, enabling the network to automatically learn domain-specific features relevant to the visual odometry and avoid the involved process of feature engineering. We demonstrate the effectiveness of our system on a simulated 2D maze and the 3D ViZ-Doom environment.},
eventtitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition Workshops}} ({{CVPRW}})},
booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition Workshops}} ({{CVPRW}})},
publisher = {{IEEE}},
date = {2018-06},
pages = {350-35009},
author = {Parisotto, Emilio and Chaplot, Devendra Singh and Zhang, Jian and Salakhutdinov, Ruslan},
file = {/Users/abdullah/Zotero/storage/W4E7VCDI/Parisotto et al. - 2018 - Global Pose Estimation with an Attention-Based Rec.pdf}
}
% == BibLateX quality report for parisottoGlobalPoseEstimation2018:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
@article{krizhevskyImageNetClassificationDeep2012,
title = {{{ImageNet Classification}} with {{Deep Convolutional Neural Networks}}},
doi = {10.1145/3065386},
journaltitle = {Adv. Neural Inf. Process. Syst.},
volume = {25},
date = {2012},
pages = {1097--1105},
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
editor = {Pereira, F. and Burges, C. J. C. and Bottou, L. and Weinberger, K. Q.},
file = {/Users/abdullah/Zotero/storage/U8YR4LH7/Krizhevsky et al. - 2012 - ImageNet Classification with Deep Convolutional Ne.pdf;/Users/abdullah/Zotero/storage/E82FVXG7/4824-imagenet-classification-with-deep-convolutional-neural-networks.html}
}
% == BibLateX quality report for krizhevskyImageNetClassificationDeep2012:
% ? Possibly abbreviated journal title Adv. Neural Inf. Process. Syst. 25
% ? Title looks like it was stored in title-case in Zotero
@article{glockerRealTimeRGBDCamera2015,
langid = {english},
title = {Real-{{Time RGB}}-{{D Camera Relocalization}} via {{Randomized Ferns}} for {{Keyframe Encoding}}},
volume = {21},
issn = {1077-2626},
doi = {10.1109/TVCG.2014.2360403},
abstract = {Recovery from tracking failure is essential in any simultaneous localization and tracking system. In this context, we explore an efficient keyframe-based relocalization method based on frame encoding using randomized ferns. The method enables automatic discovery of keyframes through online harvesting in tracking mode, and fast retrieval of pose candidates in the case when tracking is lost. Frame encoding is achieved by applying simple binary feature tests which are stored in the nodes of an ensemble of randomized ferns. The concatenation of small block codes generated by each fern yields a global compact representation of camera frames. Based on those representations we define the frame dissimilarity as the block-wise hamming distance (BlockHD). Dissimilarities between an incoming query frame and a large set of keyframes can be efficiently evaluated by simply traversing the nodes of the ferns and counting image co-occurrences in corresponding code tables. In tracking mode, those dissimilarities decide whether a frame/pose pair is considered as a novel keyframe. For tracking recovery, poses of the most similar keyframes are retrieved and used for reinitialization of the tracking algorithm. The integration of our relocalization method into a hand-held KinectFusion system allows seamless continuation of mapping even when tracking is frequently lost.},
number = {5},
journaltitle = {IEEE Trans. Visual. Comput. Graphics},
date = {2015-05-01},
pages = {571-583},
author = {Glocker, Ben and Shotton, Jamie and Criminisi, Antonio and Izadi, Shahram},
file = {/Users/abdullah/Zotero/storage/DFK8VGJM/Glocker et al. - 2015 - Real-Time RGB-D Camera Relocalization via Randomiz.pdf}
}
% == BibLateX quality report for glockerRealTimeRGBDCamera2015:
% ? Possibly abbreviated journal title IEEE Trans. Visual. Comput. Graphics
% ? Title looks like it was stored in title-case in Zotero
@article{santosMappingIndoorSpaces2016,
title = {Mapping {{Indoor Spaces}} by {{Adaptive Coarse}}-to-{{Fine Registration}} of {{RGB}}-{{D Data}}},
volume = {13},
issn = {1545-598X},
doi = {10.1109/LGRS.2015.2508880},
abstract = {In this letter, we present an adaptive coarse-to-fine registration method for 3-D indoor mapping using RGB-D data. We weight the 3-D points based on the theoretical random error of depth measurements and introduce a novel disparity-based model for an accurate and robust coarse-to-fine registration. Some feature extraction methods required by the method are also presented. First, our method exploits both visual and depth information to compute the initial transformation parameters. We employ scale-invariant feature transformation for extracting, detecting, and matching 2-D visual features, and their associated depth values are used to perform coarse registration. Then, we use an image-based segmentation technique for detecting regions in the RGB images. Their associated 3-D centroid and the correspondent disparity values are used to refine the initial transformation parameters. Finally, the loop-closure detection and a global adjustment of the complete sequence data are used to recognize when the camera has returned to a previously visited location and minimize the registration errors. The effectiveness of the proposed method is demonstrated with the Kinect data set. The experimental results show that the proposed method can properly map the indoor environment with a relative and absolute accuracy value of around 3-5 cm, respectively.},
number = {2},
journaltitle = {IEEE Geosci. Remote Sens. Lett.},
date = {2016-02},
pages = {262-266},
keywords = {Cameras,Simultaneous localization and mapping,Visualization,Feature extraction,Three-dimensional displays,2D visual feature matching,3-D indoor mapping,3D centroid,3D indoor mapping,adaptive coarse-to-fine registration method,Coarse-to-fine registration,depth measurements,disparity-based model,disparity-to-plane model,feature detection,feature extraction,feature extraction methods,global optimization,image matching,image registration,image segmentation,Image segmentation,image-based segmentation technique,indoor environment,Kinect data set,loop-closure detection,optical sensors,RGB-D data,RGB-D sensor,scale-invariant feature transformation,Solid modeling,spatial variables measurement,transforms},
author = {dos Santos, D. R. and Basso, M. A. and Khoshelham, K. and de Oliveira, E. and Pavan, N. L. and Vosselman, G.},
file = {/Users/abdullah/Zotero/storage/QSHDBCQM/Santos et al. - 2016 - Mapping Indoor Spaces by Adaptive Coarse-to-Fine R.pdf;/Users/abdullah/Zotero/storage/BUYE5C89/7374652.html}
}
% == BibLateX quality report for santosMappingIndoorSpaces2016:
% ? Possibly abbreviated journal title IEEE Geosci. Remote Sens. Lett.
% ? Title looks like it was stored in title-case in Zotero
@article{svarmCityScaleLocalizationCameras2017,
langid = {english},
title = {City-{{Scale Localization}} for {{Cameras}} with {{Known Vertical Direction}}},
volume = {39},
issn = {0162-8828, 2160-9292},
doi = {10.1109/TPAMI.2016.2598331},
abstract = {We consider the problem of localizing a novel image in a large 3D model, given that the gravitational vector is known. In principle, this is just an instance of camera pose estimation, but the scale of the problem introduces some interesting challenges. Most importantly, it makes the correspondence problem very difficult so there will often be a significant number of outliers to handle. To tackle this problem, we use recent theoretical as well as technical advances. Many modern cameras and phones have gravitational sensors that allow us to reduce the search space. Further, there are new techniques to efficiently and reliably deal with extreme rates of outliers. We extend these methods to camera pose estimation by using accurate approximations and fast polynomial solvers. Experimental results are given demonstrating that it is possible to reliably estimate the camera pose despite cases with more than 99\% outlier correspondences in city-scale models with several millions of 3D points.},
number = {7},
journaltitle = {IEEE Trans. Pattern Anal. Mach. Intell.},
date = {2017-07-01},
pages = {1455-1461},
author = {Svarm, Linus and Enqvist, Olof and Kahl, Fredrik and Oskarsson, Magnus},
file = {/Users/abdullah/Zotero/storage/FEXJL9TH/Svarm et al. - 2017 - City-Scale Localization for Cameras with Known Ver.pdf}
}
% == BibLateX quality report for svarmCityScaleLocalizationCameras2017:
% 'issn': not a valid ISSN
% ? Possibly abbreviated journal title IEEE Trans. Pattern Anal. Mach. Intell.
% ? Title looks like it was stored in title-case in Zotero
@incollection{radenovicCNNImageRetrieval2016,
langid = {english},
location = {{Cham}},
title = {{{CNN Image Retrieval Learns}} from {{BoW}}: {{Unsupervised Fine}}-{{Tuning}} with {{Hard Examples}}},
volume = {9905},
isbn = {978-3-319-46447-3 978-3-319-46448-0},
shorttitle = {{{CNN Image Retrieval Learns}} from {{BoW}}},
abstract = {Convolutional Neural Networks (CNNs) achieve state-of-the-art performance in many computer vision tasks. However, this achievement is preceded by extreme manual annotation in order to perform either training from scratch or fine-tuning for the target task. In this work, we propose to fine-tune CNN for image retrieval from a large collection of unordered images in a fully automated manner. We employ state-of-the-art retrieval and Structure-from-Motion (SfM) methods to obtain 3D models, which are used to guide the selection of the training data for CNN fine-tuning. We show that both hard positive and hard negative examples enhance the final performance in particular object retrieval with compact codes.},
booktitle = {Computer {{Vision}} – {{ECCV}} 2016},
publisher = {{Springer International Publishing}},
date = {2016},
pages = {3-20},
author = {Radenović, Filip and Tolias, Giorgos and Chum, Ondřej},
editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
file = {/Users/abdullah/Zotero/storage/DPMRCD36/Radenović et al. - 2016 - CNN Image Retrieval Learns from BoW Unsupervised .pdf},
doi = {10.1007/978-3-319-46448-0_1}
}
% == BibLateX quality report for radenovicCNNImageRetrieval2016:
% 'isbn': not a valid ISBN
% ? Title looks like it was stored in title-case in Zotero
@article{arandjelovicThreeThingsEveryone2012,
title = {Three Things Everyone Should Know to Improve Object Retrieval},
doi = {10.1109/CVPR.2012.6248018},
abstract = {The objective of this work is object retrieval in large scale image datasets, where the object is specified by an image query and retrieval should be immediate at run time in the manner of Video Google [28]. We make the following three contributions: (i) a new method to compare SIFT descriptors (RootSIFT) which yields superior performance without increasing processing or storage requirements; (ii) a novel method for query expansion where a richer model for the query is learnt discriminatively in a form suited to immediate retrieval through efficient use of the inverted index; (iii) an improvement of the image augmentation method proposed by Turcot and Lowe [29], where only the augmenting features which are spatially consistent with the augmented image are kept. We evaluate these three methods over a number of standard benchmark datasets (Oxford Buildings 5k and 105k, and Paris 6k) and demonstrate substantial improvements in retrieval performance whilst maintaining immediate retrieval speeds. Combining these complementary methods achieves a new state-of-the-art performance on these datasets.},
journaltitle = {2012 IEEE Conf. Comput. Vis. Pattern Recognit.},
date = {2012},
pages = {2911-2918},
keywords = {Benchmark (computing),Google Videos,Inverted index,Query expansion,Requirement,Run time (program lifecycle phase),Scale-invariant feature transform,While},
author = {Arandjelovic, Relja and Zisserman, Andrew}
}
% == BibLateX quality report for arandjelovicThreeThingsEveryone2012:
% ? Possibly abbreviated journal title 2012 IEEE Conf. Comput. Vis. Pattern Recognit.
@inproceedings{zhuTenfoldImprovementVisual2007,
langid = {english},
location = {{Rio de Janeiro, Brazil}},
title = {Ten-Fold {{Improvement}} in {{Visual Odometry Using Landmark Matching}}},
isbn = {978-1-4244-1630-1},
doi = {10.1109/ICCV.2007.4409062},
abstract = {Our goal is to create a visual odometry system for robots and wearable systems such that localization accuracies of centimeters can be obtained for hundreds of meters of distance traveled. Existing systems have achieved approximately a 1\% to 5\% localization error rate whereas our proposed system achieves close to 0.1\% error rate, a ten-fold reduction. Traditional visual odometry systems drift over time as the frame-to-frame errors accumulate. In this paper, we propose to improve visual odometry using visual landmarks in the scene. First, a dynamic local landmark tracking technique is proposed to track a set of local landmarks across image frames and select an optimal set of tracked local landmarks for pose computation. As a result, the error associated with each pose computation is minimized to reduce the drift significantly. Second, a global landmark based drift correction technique is proposed to recognize previously visited locations and use them to correct drift accumulated during motion. At each visited location along the route, a set of distinctive visual landmarks is automatically extracted and inserted into a landmark database dynamically. We integrate the landmark based approach into a navigation system with 2 stereo pairs and a low-cost Inertial Measurement Unit (IMU) for increased robustness. We demonstrate that a real-time visual odometry system using local and global landmarks can precisely locate a user within 1 meter over 1000 meters in unknown indoor/outdoor environments with challenging situations such as climbing stairs, opening doors, moving foreground objects, etc.},
eventtitle = {2007 {{IEEE}} 11th {{International Conference}} on {{Computer Vision}}},
booktitle = {2007 {{IEEE}} 11th {{International Conference}} on {{Computer Vision}}},
publisher = {{IEEE}},
date = {2007},
pages = {1-8},
author = {Zhu, Zhiwei and Oskiper, Taragay and Samarasekera, Supun and Kumar, Rakesh and Sawhney, Harpreet S.},
file = {/Users/abdullah/Zotero/storage/KQ8LBEMX/Zhu et al. - 2007 - Ten-fold Improvement in Visual Odometry Using Land.pdf}
}
% == BibLateX quality report for zhuTenfoldImprovementVisual2007:
% ? Unsure about the formatting of the booktitle
@inproceedings{xiaoLightweightMapMatching2014,
langid = {english},
location = {{Berlin}},
title = {Lightweight Map Matching for Indoor Localisation Using Conditional Random Fields},
isbn = {978-1-4799-3146-0 978-1-4799-3147-7},
doi = {10.1109/IPSN.2014.6846747},
abstract = {Indoor tracking and navigation is a fundamental need for pervasive and context-aware smartphone applications. Although indoor maps are becoming increasingly available, there is no practical and reliable indoor map matching solution available at present. We present MapCraft, a novel, robust and responsive technique that is extremely computationally efficient (running in under 10 ms on an Android smartphone), does not require training in different sites, and tracks well even when presented with very noisy sensor data. Key to our approach is expressing the tracking problem as a conditional random field (CRF), a technique which has had great success in areas such as natural language processing, but has yet to be considered for indoor tracking. Unlike directed graphical models like Hidden Markov Models, CRFs capture arbitrary constraints that express how well observations support state transitions, given map constraints. Extensive experiments in multiple sites show how MapCraft outperforms state-of-the-art approaches, demonstrating excellent tracking error and accurate reconstruction of tortuous trajectories with zero training effort. As proof of its robustness, we also demonstrate how it is able to accurately track the position of a user from accelerometer and magnetometer measurements only (i.e. gyro- and WiFi-free). We believe that such an energy-efficient approach will enable always-on background localisation, enabling a new era of location-aware applications to be developed.},
eventtitle = {2014 13th {{ACM}}/{{IEEE International Conference}} on {{Information Processing}} in {{Sensor Networks}} ({{IPSN}})},
booktitle = {{{IPSN}}-14 {{Proceedings}} of the 13th {{International Symposium}} on {{Information Processing}} in {{Sensor Networks}}},
publisher = {{IEEE}},
date = {2014-04},
pages = {131-142},
author = {Xiao, Zhuoling and Wen, Hongkai and Markham, Andrew and Trigoni, Niki},
file = {/Users/abdullah/Zotero/storage/4XYGQGWM/Xiao et al. - 2014 - Lightweight map matching for indoor localisation u.pdf}
}
% == BibLateX quality report for xiaoLightweightMapMatching2014:
% 'isbn': not a valid ISBN
@article{zampellaIndoorPositioningUsing2015,
langid = {english},
title = {Indoor {{Positioning Using Efficient Map Matching}}, {{RSS Measurements}}, and an {{Improved Motion Model}}},
volume = {64},
issn = {0018-9545, 1939-9359},
doi = {10.1109/TVT.2015.2391296},
abstract = {Unlike outdoor positioning, there is not a unique solution to obtain the position of a person inside a building or in GNSS denied areas. Typical implementations indoor rely on dead reckoning or beacon based positioning, but a robust estimation must combine several techniques to overcome their own drawbacks. In this paper, we present an indoor positioning system based on foot mounted Pedestrian Dead Reckoning (PDR) with an efficient Map Matching, Received Signal Strength (RSS) measurements and an improved motion model that includes the estimation of the turn rate bias. The system was implemented using a two levels structure with a low level PDR-filter and a high level particle filter to include all the measurements.},
number = {4},
journaltitle = {IEEE Trans. Veh. Technol.},
date = {2015-04},
pages = {1304-1317},
author = {Zampella, Francisco and Jimenez Ruiz, Antonio Ramon and Seco Granja, Fernando},
file = {/Users/abdullah/Zotero/storage/KNLCHL6Z/Zampella et al. - 2015 - Indoor Positioning Using Efficient Map Matching, R.pdf}
}
% == BibLateX quality report for zampellaIndoorPositioningUsing2015:
% 'issn': not a valid ISSN
% ? Possibly abbreviated journal title IEEE Trans. Veh. Technol.
% ? Title looks like it was stored in title-case in Zotero
@article{acharyaBIMTrackerModelbasedVisual2019,
langid = {english},
title = {{{BIM}}-{{Tracker}}: {{A}} Model-Based Visual Tracking Approach for Indoor Localisation Using a {{3D}} Building Model},
volume = {150},
issn = {09242716},
doi = {10.1016/j.isprsjprs.2019.02.014},
shorttitle = {{{BIM}}-{{Tracker}}},
abstract = {This article presents an accurate and robust visual indoor localisation approach that not only is infrastructurefree, but also avoids accumulation error by taking advantage of 1) the widespread ubiquity of mobile devices with cameras and 2) the availability of 3D building models for most modern buildings. Localisation is performed by matching image sequences captured by a camera, with a 3D model of the building in a model-based visual tracking framework. Comprehensive evaluation of the approach with a photo-realistic synthetic dataset shows the robustness of the localisation approach under challenging conditions. Additionally, the approach is tested and evaluated on real data captured by a smartphone. The results of the experiments indicate that a localisation accuracy better than 10 centimetres can be achieved by using this approach. Since localisation errors do not accumulate the proposed approach is suitable for indoor localisation tasks for long periods of time and augmented reality applications, without requiring any local infrastructure. A MATLAB implementation can be found on https://github.com/debaditya-unimelb/BIM-Tracker. For a video demo visit: https://youtu.be/cq7mk4mfdRA.},
journaltitle = {ISPRS Journal of Photogrammetry and Remote Sensing},
date = {2019-04},
pages = {157-171},
author = {Acharya, Debaditya and Ramezani, Milad and Khoshelham, Kourosh and Winter, Stephan},
file = {/Users/abdullah/Zotero/storage/ZME9LRNK/Acharya et al. - 2019 - BIM-Tracker A model-based visual tracking approac.pdf}
}
@article{lepetitMonocularModelBased3D2005,
langid = {english},
title = {Monocular {{Model}}-{{Based 3D Tracking}} of {{Rigid Objects}}: {{A Survey}}},
volume = {1},
issn = {1572-2740, 1572-2759},
doi = {10.1561/0600000001},
shorttitle = {Monocular {{Model}}-{{Based 3D Tracking}} of {{Rigid Objects}}},
abstract = {Many applications require tracking of complex 3D objects. These include visual servoing of robotic arms on specific target objects, Augmented Reality systems that require real-time registration of the object to be augmented, and head tracking systems that sophisticated interfaces can use. Computer Vision offers solutions that are cheap, practical and non-invasive.},
number = {1},
journaltitle = {Foundations and Trends in Computer Graphics and Vision},
date = {2005},
pages = {1-89},
author = {Lepetit, Vincent and Fua, Pascal},
file = {/Users/abdullah/Zotero/storage/8QENBZ2L/Lepetit und Fua - 2005 - Monocular Model-Based 3D Tracking of Rigid Objects.pdf}
}
% == BibLateX quality report for lepetitMonocularModelBased3D2005:
% 'issn': not a valid ISSN
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{davisonRealtimeSimultaneousLocalisation2003,
langid = {english},
location = {{Nice, France}},
title = {Real-Time Simultaneous Localisation and Mapping with a Single Camera},
isbn = {978-0-7695-1950-0},
doi = {10.1109/ICCV.2003.1238654},
abstract = {Ego-motion estimation for an agile single camera moving through general, unknown scenes becomes a much more challenging problem when real-time performance is required rather than under the off-line processing conditions under which most successful structure from motion work has been achieved. This task of estimating camera motion from measurements of a continuously expanding set of selfmapped visual features is one of a class of problems known as Simultaneous Localisation and Mapping (SLAM) in the robotics community, and we argue that such real-time mapping research, despite rarely being camera-based, is more relevant here than off-line structure from motion methods due to the more fundamental emphasis placed on propagation of uncertainty.},
eventtitle = {{{ICCV}} 2003: 9th {{International Conference}} on {{Computer Vision}}},
booktitle = {Proceedings {{Ninth IEEE International Conference}} on {{Computer Vision}}},
publisher = {{IEEE}},
date = {2003},
pages = {1403-1410},
volume = {2},
author = {Davison, Andrew J.},
file = {/Users/abdullah/Zotero/storage/E8PCDP4J/Davison - 2003 - Real-time simultaneous localisation and mapping wi.pdf}
}
@article{harleSurveyIndoorInertial23,
langid = {english},
title = {A {{Survey}} of {{Indoor Inertial Positioning Systems}} for {{Pedestrians}}},
volume = {15},
issn = {1553-877X},
doi = {10.1109/SURV.2012.121912.00075},
abstract = {With the continual miniaturisation of sensors and processing nodes, Pedestrian Dead Reckoning (PDR) systems are becoming feasible options for indoor tracking. These use inertial and other sensors, often combined with domain-specific knowledge about walking, to track user movements. There is currently a wealth of relevant literature spread across different research communities. In this survey, a taxonomy of modern PDRs is developed and used to contextualise the contributions from different areas. Techniques for step detection, characterisation, inertial navigation and step-and-heading-based dead-reckoning are reviewed and compared. Techniques that incorporate building maps through particle filters are analysed, along with hybrid systems that use absolute position fixes to correct dead-reckoning output. In addition, consideration is given to the possibility of using smartphones as PDR sensing devices.},
number = {3},
journaltitle = {IEEE Commun. Surv. Tutorials},
date = {2013},
pages = {1281-1293},
author = {Harle, Robert},
file = {/Users/abdullah/Zotero/storage/MRA2AQD2/Harle - 2013 - A Survey of Indoor Inertial Positioning Systems fo.pdf}
}
% == BibLateX quality report for harleSurveyIndoorInertial23:
% ? Possibly abbreviated journal title IEEE Commun. Surv. Tutorials
% ? Title looks like it was stored in title-case in Zotero
@article{hassanIndoorPositioningUsing,
langid = {english},
title = {Indoor {{Positioning Using Visible LED Lights}}: {{A Survey}}},
pages = {31},
author = {Hassan, Naveed Ul and Naeem, Aqsa and Pasha, Muhammad Adeel and Jadoon, Tariq},
file = {/Users/abdullah/Zotero/storage/4DDEFGL5/Hassan et al. - Indoor Positioning Using Visible LED Lights A Sur.pdf}
}
% == BibLateX quality report for hassanIndoorPositioningUsing:
% Exactly one of 'date' / 'year' must be present
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{mautzIndoorPositioningTechnologies2012,
langid = {english},
title = {Indoor Positioning Technologies},
doi = {10.3929/ethz-a-007313554},
date = {2012},
pages = {1 volume},
keywords = {Architecture,ASTRO-GEODETIC DETERMINATION OF POSITION + GEOGRAPHICAL COORDINATES (GEODESY),ASTRONOMISCH-GEODÄTISCHE ORTSBESTIMMUNG + GEOGRAPHISCHE KOORDINATEN (GEODÄSIE),Earth sciences,GEODÄTISCHE MESSVERFAHREN (GEODÄSIE),GEODETIC MEASURING METHODS (GEODESY),GEOMATICS (GEOGRAPHY),GEOMATIK (GEOGRAFIE),GLOBAL NAVIGATION SATELLITE SYSTEM; GNSS (GEODÄSIE),GLOBAL NAVIGATION SATELLITE SYSTEM; GNSS (GEODESY),GLOBAL POSITIONING SYSTEM; GPS + INDOOR GPS (GEODÄSIE),GLOBAL POSITIONING SYSTEM; GPS + INDOOR GPS (GEODESY),info:eu-repo/classification/ddc/550,info:eu-repo/classification/ddc/720},
author = {Mautz, Rainer},
file = {/Users/abdullah/Zotero/storage/UQFAZ3F2/Mautz - 2012 - Indoor positioning technologies.pdf}
}
% == BibLateX quality report for mautzIndoorPositioningTechnologies2012:
% Missing required field 'journaltitle'
@inproceedings{wuDelvingDeeperConvolutional2017,
langid = {english},
location = {{Singapore, Singapore}},
title = {Delving Deeper into Convolutional Neural Networks for Camera Relocalization},
isbn = {978-1-5090-4633-1},
doi = {10.1109/ICRA.2017.7989663},
abstract = {Convolutional Neural Networks (CNNs) have been applied to camera relocalization, which is to infer the pose of the camera given a single monocular image. However, there are still many open problems for camera relocalization with CNNs. We delve into the CNNs for camera relocalization. First, a variant of Euler angles named Euler6 is proposed to represent orientation. Then a data augmentation method named pose synthesis is designed to reduce sparsity of poses in the whole pose space to cope with overfitting in training. Third, a multi-task CNN named BranchNet is proposed to deal with the complex coupling of orientation and translation. The network consists of several shared convolutional layers and splits into two branches which predict orientation and translation, respectively. Experiments on the 7Scenes dataset show that incorporating these techniques one by one into an existing model PoseNet always leads to better results. Together these techniques reduce the orientation error by 15.9\% and the translation error by 38.3\% compared to the state-of-the-art model Bayesian PoseNet. We implement BranchNet on an Intel NUC mobile platform and reach a speed of 43 fps, which meets the real-time requirement of many robotic applications.},
eventtitle = {2017 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
booktitle = {2017 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
publisher = {{IEEE}},
date = {2017-05},
pages = {5644-5651},
author = {Wu, Jian and Ma, Liwei and Hu, Xiaolin},
file = {/Users/abdullah/Zotero/storage/3BXUW6N4/Wu et al. - 2017 - Delving deeper into convolutional neural networks .pdf}
}
% == BibLateX quality report for wuDelvingDeeperConvolutional2017:
% ? Unsure about the formatting of the booktitle
@article{mullerSQUEEZEPOSENETIMAGEBASED2017,
langid = {english},
title = {{{SqueezePoseNet}}: {{Image Based Pose Regression}} with {{Small Convolutional Neural Networks}} for {{Real Time UAS Navigation}}},
volume = {IV-2/W3},
issn = {2194-9050},
doi = {10.5194/isprs-annals-IV-2-W3-49-2017},
shorttitle = {{{SqueezePoseNet}}},
abstract = {The number of unmanned aerial vehicles (UAVs) is increasing since low-cost airborne systems are available for a wide range of users. The outdoor navigation of such vehicles is mostly based on global navigation satellite system (GNSS) methods to gain the vehicles trajectory. The drawback of satellite-based navigation are failures caused by occlusions and multi-path interferences. Beside this, local image-based solutions like Simultaneous Localization and Mapping (SLAM) and Visual Odometry (VO) can e.g. be used to support the GNSS solution by closing trajectory gaps but are computationally expensive. However, if the trajectory estimation is interrupted or not available a re-localization is mandatory. In this paper we will provide a novel method for a GNSS-free and fast image-based pose regression in a known area by utilizing a small convolutional neural network (CNN). With on-board processing in mind, we employ a lightweight CNN called SqueezeNet and use transfer learning to adapt the network to pose regression. Our experiments show promising results for GNSS-free and fast localization.},
journaltitle = {ISPRS Ann. Photogramm. Remote Sens. Spatial Inf. Sci.},
date = {2017-08-18},
pages = {49-57},
author = {Müller, M. S. and Urban, S. and Jutzi, B.},
file = {/Users/abdullah/Zotero/storage/XENDKWHE/Müller et al. - 2017 - SQUEEZEPOSENET IMAGE BASED POSE REGRESSION WITH S.pdf}
}
% == BibLateX quality report for mullerSQUEEZEPOSENETIMAGEBASED2017:
% ? Possibly abbreviated journal title ISPRS Ann. Photogramm. Remote Sens. Spatial Inf. Sci.
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{valadaDeepAuxiliaryLearning2018,
langid = {english},
location = {{Brisbane, QLD}},
title = {Deep {{Auxiliary Learning}} for {{Visual Localization}} and {{Odometry}}},
isbn = {978-1-5386-3081-5},
doi = {10.1109/ICRA.2018.8462979},
abstract = {Localization is an indispensable component of a robot’s autonomy stack that enables it to determine where it is in the environment, essentially making it a precursor for any action execution or planning. Although convolutional neural networks have shown promising results for visual localization, they are still grossly outperformed by state-of-the-art local feature-based techniques. In this work, we propose VLocNet, a new convolutional neural network architecture for 6-DoF global pose regression and odometry estimation from consecutive monocular images. Our multitask model incorporates hard parameter sharing, thus being compact and enabling real-time inference, in addition to being end-to-end trainable. We propose a novel loss function that utilizes auxiliary learning to leverage relative pose information during training, thereby constraining the search space to obtain consistent pose estimates. We evaluate our proposed VLocNet on indoor as well as outdoor datasets and show that even our single task model exceeds the performance of state-of-the-art deep architectures for global localization, while achieving competitive performance for visual odometry estimation. Furthermore, we present extensive experimental evaluations utilizing our proposed Geometric Consistency Loss that show the effectiveness of multitask learning and demonstrate that our model is the first deep learning technique to be on par with, and in some cases outperforms state-of-the-art SIFT-based approaches.},
eventtitle = {2018 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
booktitle = {2018 {{IEEE International Conference}} on {{Robotics}} and {{Automation}} ({{ICRA}})},
publisher = {{IEEE}},
date = {2018-05},
pages = {6939-6946},
author = {Valada, Abhinav and Radwan, Noha and Burgard, Wolfram},
file = {/Users/abdullah/Zotero/storage/CMRCM9V2/Valada et al. - 2018 - Deep Auxiliary Learning for Visual Localization an.pdf}
}
% == BibLateX quality report for valadaDeepAuxiliaryLearning2018:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
@article{valadaIncorporatingSemanticGeometric2018,
langid = {english},
title = {Incorporating {{Semantic}} and {{Geometric Priors}} in {{Deep Pose Regression}}},
url = {http://ais.informatik.uni-freiburg.de/publications/papers/valada18rsslair.pdf},
abstract = {Deep learning has enabled recent breakthroughs across a wide spectrum of scene understanding tasks, however, its applicability to camera pose regression has been unfruitful due to the direct formulation that renders it incapable of encoding scene-specific constraints. In this work, we propose the VLocNet++ architecture that overcomes this limitation by simultaneously embedding geometric and semantic knowledge of the world into the pose regression network. We employ a multitask learning approach to exploit the inter-task relationship between learning semantics, regressing 6-DoF global pose and odometry for the mutual benefit of each of these tasks. Furthermore, in order to enforce global consistency during camera pose regression, we propose the novel Geometric Consistency Loss function that leverages the predicted relative motion estimated from odometry to constrict the search space while training. Extensive experiments on the challenging Microsoft 7-Scenes benchmark and our DeepLoc dataset demonstrate that our approach exceeds the state-of-the-art outperforming local feature-based methods while simultaneously performing multiple tasks and exhibiting substantial robustness in challenging scenarios.},
date = {2018},
pages = {4},
author = {Valada, Abhinav and Radwan, Noha and Burgard, Wolfram},
file = {/Users/abdullah/Zotero/storage/CL7CDTMD/Valada et al. - Incorporating Semantic and Geometric Priors in Dee.pdf}
}
% == BibLateX quality report for valadaIncorporatingSemanticGeometric2018:
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{heDeepResidualLearning2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1512.03385},
primaryClass = {cs},
langid = {english},
title = {Deep {{Residual Learning}} for {{Image Recognition}}},
url = {http://arxiv.org/abs/1512.03385},
abstract = {Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers—8× deeper than VGG nets [41] but still having lower complexity. An ensemble of these residual nets achieves 3.57\% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers.},
urldate = {2019-06-18},
date = {2015-12-10},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}
}
% == BibLateX quality report for heDeepResidualLearning2015:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@incollection{liWorldwidePoseEstimation2012,
langid = {english},
location = {{Berlin, Heidelberg}},
title = {Worldwide {{Pose Estimation Using 3D Point Clouds}}},
volume = {7572},
isbn = {978-3-642-33717-8 978-3-642-33718-5},
abstract = {We address the problem of determining where a photo was taken by estimating a full 6-DOF-plus-intrinsics camera pose with respect to a large geo-registered 3D point cloud, bringing together research on image localization, landmark recognition, and 3D pose estimation. Our method scales to datasets with hundreds of thousands of images and tens of millions of 3D points through the use of two new techniques: a co-occurrence prior for RANSAC and bidirectional matching of image features with 3D points. We evaluate our method on several large data sets, and show state-of-the-art results on landmark recognition as well as the ability to locate cameras to within meters, requiring only seconds per query.},
booktitle = {Computer {{Vision}} – {{ECCV}} 2012},
publisher = {{Springer Berlin Heidelberg}},
date = {2012},
pages = {15-29},
author = {Li, Yunpeng and Snavely, Noah and Huttenlocher, Dan and Fua, Pascal},
editor = {Fitzgibbon, Andrew and Lazebnik, Svetlana and Perona, Pietro and Sato, Yoichi and Schmid, Cordelia},
editorb = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard},
editorbtype = {redactor},
file = {/Users/abdullah/Zotero/storage/XV7SJYPL/Li et al. - 2012 - Worldwide Pose Estimation Using 3D Point Clouds.pdf},
doi = {10.1007/978-3-642-33718-5_2}
}
% == BibLateX quality report for liWorldwidePoseEstimation2012:
% 'isbn': not a valid ISBN
% ? Title looks like it was stored in title-case in Zotero
@article{shafaeiPlayLearnUsing2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1608.01745},
primaryClass = {cs},
title = {Play and {{Learn}}: {{Using Video Games}} to {{Train Computer Vision Models}}},
url = {http://arxiv.org/abs/1608.01745},
shorttitle = {Play and {{Learn}}},
abstract = {Video games are a compelling source of annotated data as they can readily provide fine-grained groundtruth for diverse tasks. However, it is not clear whether the synthetically generated data has enough resemblance to the real-world images to improve the performance of computer vision models in practice. We present experiments assessing the effectiveness on real-world data of systems trained on synthetic RGB images that are extracted from a video game. We collected over 60000 synthetic samples from a modern video game with similar conditions to the real-world CamVid and Cityscapes datasets. We provide several experiments to demonstrate that the synthetically generated RGB images can be used to improve the performance of deep neural networks on both image segmentation and depth estimation. These results show that a convolutional network trained on synthetic data achieves a similar test error to a network that is trained on real-world data for dense image classification. Furthermore, the synthetically generated RGB images can provide similar or better results compared to the real-world datasets if a simple domain adaptation technique is applied. Our results suggest that collaboration with game developers for an accessible interface to gather data is potentially a fruitful direction for future work in computer vision.},
urldate = {2019-06-19},
date = {2016-08-04},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Shafaei, Alireza and Little, James J. and Schmidt, Mark},
file = {/Users/abdullah/Zotero/storage/DZRG8XBF/Shafaei et al. - 2016 - Play and Learn Using Video Games to Train Compute.pdf;/Users/abdullah/Zotero/storage/28ZWJJDA/1608.html}
}
% == BibLateX quality report for shafaeiPlayLearnUsing2016:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@inproceedings{dosovitskiyFlowNetLearningOptical2015,
langid = {english},
location = {{Santiago}},
title = {{{FlowNet}}: {{Learning Optical Flow}} with {{Convolutional Networks}}},
isbn = {978-1-4673-8391-2},
doi = {10.1109/ICCV.2015.316},
shorttitle = {{{FlowNet}}},
abstract = {Convolutional neural networks (CNNs) have recently been very successful in a variety of computer vision tasks, especially on those linked to recognition. Optical flow estimation has not been among the tasks CNNs succeeded at. In this paper we construct CNNs which are capable of solving the optical flow estimation problem as a supervised learning task. We propose and compare two architectures: a generic architecture and another one including a layer that correlates feature vectors at different image locations. Since existing ground truth data sets are not sufficiently large to train a CNN, we generate a large synthetic Flying Chairs dataset. We show that networks trained on this unrealistic data still generalize very well to existing datasets such as Sintel and KITTI, achieving competitive accuracy at frame rates of 5 to 10 fps.},
eventtitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
booktitle = {2015 {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
publisher = {{IEEE}},
date = {2015-12},
pages = {2758-2766},
author = {Dosovitskiy, Alexey and Fischer, Philipp and Ilg, Eddy and Hausser, Philip and Hazirbas, Caner and Golkov, Vladimir and van der Smagt, Patrick and Cremers, Daniel and Brox, Thomas},
file = {/Users/abdullah/Zotero/storage/686IXTAG/Dosovitskiy et al. - 2015 - FlowNet Learning Optical Flow with Convolutional .pdf}
}
% == BibLateX quality report for dosovitskiyFlowNetLearningOptical2015:
% ? Unsure about the formatting of the booktitle
% ? Title looks like it was stored in title-case in Zotero
@article{simonyanVeryDeepConvolutional2014,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1409.1556},
primaryClass = {cs},
title = {Very {{Deep Convolutional Networks}} for {{Large}}-{{Scale Image Recognition}}},
url = {http://arxiv.org/abs/1409.1556},
abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.},
urldate = {2019-06-19},
date = {2014-09-04},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Simonyan, Karen and Zisserman, Andrew},
file = {/Users/abdullah/Zotero/storage/G8U732PD/Simonyan and Zisserman - 2014 - Very Deep Convolutional Networks for Large-Scale I.pdf;/Users/abdullah/Zotero/storage/CCMKBL6U/1409.html}
}
% == BibLateX quality report for simonyanVeryDeepConvolutional2014:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@article{kokkinosPushingBoundariesBoundary2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1511.07386},
primaryClass = {cs},
title = {Pushing the {{Boundaries}} of {{Boundary Detection}} Using {{Deep Learning}}},
url = {http://arxiv.org/abs/1511.07386},
abstract = {In this work we show that adapting Deep Convolutional Neural Network training to the task of boundary detection can result in substantial improvements over the current state-of-the-art in boundary detection. Our contributions consist firstly in combining a careful design of the loss for boundary detection training, a multi-resolution architecture and training with external data to improve the detection accuracy of the current state of the art. When measured on the standard Berkeley Segmentation Dataset, we improve the optimal dataset scale F-measure from 0.780 to 0.808 - while human performance is at 0.803. We further improve performance to 0.813 by combining deep learning with grouping, integrating the Normalized Cuts technique within a deep network. We also examine the potential of our boundary detector in conjunction with the task of semantic segmentation and demonstrate clear improvements over state-of-the-art systems. Our detector is fully integrated in the popular Caffe framework and processes a 320x420 image in less than a second.},
urldate = {2019-06-20},
date = {2015-11-23},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
author = {Kokkinos, Iasonas},
file = {/Users/abdullah/Zotero/storage/QYPZQT8E/Kokkinos - 2015 - Pushing the Boundaries of Boundary Detection using.pdf;/Users/abdullah/Zotero/storage/Y2CYWIME/1511.html}
}
% == BibLateX quality report for kokkinosPushingBoundariesBoundary2015:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
@article{maninisConvolutionalOrientedBoundaries2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1608.02755},
primaryClass = {cs},
title = {Convolutional {{Oriented Boundaries}}},
volume = {9905},
doi = {10.1007/978-3-319-46448-0_35},
abstract = {We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it gives a significant leap in performance over the state-of-the-art, and it generalizes very well to unseen categories and datasets. Particularly, we show that learning to estimate not only contour strength but also orientation provides more accurate results. We perform extensive experiments on BSDS, PASCAL Context, PASCAL Segmentation, and MS-COCO, showing that COB provides state-of-the-art contours, region hierarchies, and object proposals in all datasets.},
date = {2016},
pages = {580-596},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Maninis, Kevis-Kokitsi and Pont-Tuset, Jordi and Arbeláez, Pablo and Van Gool, Luc},
file = {/Users/abdullah/Zotero/storage/6B4F8CGV/Maninis et al. - 2016 - Convolutional Oriented Boundaries.pdf;/Users/abdullah/Zotero/storage/2KDHEAIW/1608.html}
}
% == BibLateX quality report for maninisConvolutionalOrientedBoundaries2016:
% Unexpected field 'archivePrefix'
% Unexpected field 'primaryClass'
% Missing required field 'journaltitle'
% ? Title looks like it was stored in title-case in Zotero
@incollection{hazirbasFuseNetIncorporatingDepth2017,
langid = {english},
location = {{Cham}},
title = {{{FuseNet}}: {{Incorporating Depth}} into {{Semantic Segmentation}} via {{Fusion}}-{{Based CNN Architecture}}},