diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 33e9f4a..0000000 --- a/Dockerfile +++ /dev/null @@ -1,65 +0,0 @@ -# Base image -FROM nvidia/cudagl:10.1-devel-ubuntu16.04 - -# Setup basic packages -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - git \ - curl \ - vim \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libglfw3-dev \ - libglm-dev \ - libx11-dev \ - libomp-dev \ - libegl1-mesa-dev \ - pkg-config \ - wget \ - zip \ - htop \ - tmux \ - unzip &&\ - rm -rf /var/lib/apt/lists/* - -# Install conda -RUN wget -O $HOME/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh &&\ - chmod +x ~/miniconda.sh &&\ - ~/miniconda.sh -b -p /custom/conda &&\ - rm ~/miniconda.sh &&\ - /custom/conda/bin/conda install numpy pyyaml scipy ipython mkl mkl-include &&\ - /custom/conda/bin/conda clean -ya -ENV PATH /custom/conda/bin:$PATH - -# Install cmake -RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.0/cmake-3.14.0-Linux-x86_64.sh -RUN mkdir /opt/cmake -RUN sh /cmake-3.14.0-Linux-x86_64.sh --prefix=/opt/cmake --skip-license -RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake -RUN cmake --version - -# Setup habitat-sim -RUN git clone https://github.com/facebookresearch/habitat-sim.git -RUN /bin/bash -c "cd habitat-sim; git checkout tags/v0.1.5; pip install -r requirements.txt; python setup.py install --headless --with-cuda" - -# Install challenge specific habitat-api -RUN git clone https://github.com/facebookresearch/habitat-api.git -RUN /bin/bash -c "cd habitat-api; git checkout tags/v0.1.5; pip install -e ." -RUN /bin/bash -c "cd habitat-api; wget http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip; unzip habitat-test-scenes.zip" - -# Silence habitat-sim logs -ENV GLOG_minloglevel=2 -ENV MAGNUM_LOG="quiet" - -# Install project specific packages -RUN /bin/bash -c "apt-get update; apt-get install -y libsm6 libxext6 libxrender-dev; pip install opencv-python" -RUN /bin/bash -c "pip install --upgrade cython numpy" -RUN /bin/bash -c "pip install matplotlib seaborn==0.9.0 scikit-fmm==2019.1.30 scikit-image==0.15.0 imageio==2.6.0 scikit-learn==0.22.2.post1 ifcfg" - -# Install pytorch and torch_scatter -RUN conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.2 -c pytorch -RUN /bin/bash -c "pip install torch_scatter" - -# Install detectron2 -RUN /bin/bash -c "python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.6/index.html" diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 9f37f1a..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2020 Devendra Chaplot - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index e7a6f76..0000000 --- a/README.md +++ /dev/null @@ -1,155 +0,0 @@ -# Object Goal Navigation using Goal-Oriented Semantic Exploration -This is a PyTorch implementation of the NeurIPS-20 paper: - -[Object Goal Navigation using Goal-Oriented Semantic Exploration](https://arxiv.org/pdf/2007.00643.pdf)
-Devendra Singh Chaplot, Dhiraj Gandhi, Abhinav Gupta, Ruslan Salakhutdinov
-Carnegie Mellon University, Facebook AI Research - -Winner of the [CVPR 2020 Habitat ObjectNav Challenge](https://aihabitat.org/challenge/2020/). - -Project Website: https://devendrachaplot.github.io/projects/semantic-exploration - -![example](./docs/example.gif) - -### Overview: -The Goal-Oriented Semantic Exploration (SemExp) model consists of three modules: a Semantic Mapping Module, a Goal-Oriented Semantic Policy, and a deterministic Local Policy. -As shown below, the Semantic Mapping model builds a semantic map over time. The Goal-Oriented Semantic Policy selects a long-term goal based on the semantic -map to reach the given object goal efficiently. A deterministic local policy based on analytical planners is used to take low-level navigation actions to reach the long-term goal. - -![overview](./docs/overview.jpg) - -### This repository contains: -- Episode train and test datasets for [Object Goal Navigation](https://arxiv.org/pdf/2007.00643.pdf) task for the Gibson dataset in the Habitat Simulator. -- The code to train and evaluate the Semantic Exploration (SemExp) model on the Object Goal Navigation task. -- Pretrained SemExp model. - -## Installing Dependencies -- We use earlier versions of [habitat-sim](https://github.com/facebookresearch/habitat-sim) and [habitat-lab](https://github.com/facebookresearch/habitat-lab) as specified below: - -Installing habitat-sim: -``` -git clone https://github.com/facebookresearch/habitat-sim.git -cd habitat-sim; git checkout tags/v0.1.5; -pip install -r requirements.txt; -python setup.py install --headless -python setup.py install # (for Mac OS) -``` - -Installing habitat-lab: -``` -git clone https://github.com/facebookresearch/habitat-lab.git -cd habitat-lab; git checkout tags/v0.1.5; -pip install -e . -``` -Check habitat installation by running `python examples/benchmark.py` in the habitat-lab folder. - -- Install [pytorch](https://pytorch.org/) according to your system configuration. The code is tested on pytorch v1.6.0 and cudatoolkit v10.2. If you are using conda: -``` -conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.2 #(Linux with GPU) -conda install pytorch==1.6.0 torchvision==0.7.0 -c pytorch #(Mac OS) -``` - -- Install [detectron2](https://github.com/facebookresearch/detectron2/) according to your system configuration. If you are using conda: -``` -python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.6/index.html #(Linux with GPU) -CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' #(Mac OS) -``` - -### Docker and Singularity images: -We provide experimental [docker](https://www.docker.com/) and [singularity](https://sylabs.io/) images with all the dependencies installed, see [Docker Instructions](./docs/DOCKER_INSTRUCTIONS.md). - - -## Setup -Clone the repository and install other requirements: -``` -git clone https://github.com/devendrachaplot/Object-Goal-Navigation/ -cd Object-Goal-Navigation/; -pip install -r requirements.txt -``` - -### Downloading scene dataset -- Download the Gibson dataset using the instructions here: https://github.com/facebookresearch/habitat-lab#scenes-datasets (download the 11GB file `gibson_habitat_trainval.zip`) -- Move the Gibson scene dataset or create a symlink at `data/scene_datasets/gibson_semantic`. 
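Before moving on, a quick check that the scenes are actually visible at the expected path can save debugging later. The snippet below is only an illustrative sketch (the check itself is not part of this repository); the directory name and the `.glb`/`.navmesh` pairing follow the layout listed under "Setting up datasets" below:
```
import glob
import os

scene_dir = "data/scene_datasets/gibson_semantic"
glb_files = sorted(glob.glob(os.path.join(scene_dir, "*.glb")))
navmesh_files = sorted(glob.glob(os.path.join(scene_dir, "*.navmesh")))

print("Found {} .glb and {} .navmesh files under {}".format(
    len(glb_files), len(navmesh_files), scene_dir))
assert glb_files and navmesh_files, \
    "Gibson scenes not found -- check the symlink at data/scene_datasets/gibson_semantic"
```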
- -### Downloading episode dataset -- Download the episode dataset: -``` -wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1tslnZAkH8m3V5nP8pbtBmaR2XEfr8Rau' -O objectnav_gibson_v1.1.zip -``` -- Unzip the dataset into `data/datasets/objectnav/gibson/v1.1/` - -### Setting up datasets -The code requires the datasets in a `data` folder in the following format (same as habitat-lab): -``` -Object-Goal-Navigation/ - data/ - scene_datasets/ - gibson_semantic/ - Adrian.glb - Adrian.navmesh - ... - datasets/ - objectnav/ - gibson/ - v1.1/ - train/ - val/ -``` - - -### Test setup -To verify that the data is setup correctly, run: -``` -python test.py --agent random -n1 --num_eval_episodes 1 --auto_gpu_config 0 -``` - -## Usage - -### Training: -For training the SemExp model on the Object Goal Navigation task: -``` -python main.py -``` - -### Downloading pre-trained models -``` -mkdir pretrained_models; -wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=171ZA7XNu5vi3XLpuKs8DuGGZrYyuSjL0' -O pretrained_models/sem_exp.pth -``` - -### For evaluation: -For evaluating the pre-trained model: -``` -python main.py --split val --eval 1 --load pretrained_models/sem_exp.pth -``` - -For visualizing the agent observations and predicted semantic map, add `-v 1` as an argument to the above command. - -The pre-trained model should get 0.657 Success, 0.339 SPL and 1.474 DTG. - -For more detailed instructions, see [INSTRUCTIONS](./docs/INSTRUCTIONS.md). - - -## Cite as ->Chaplot, D.S., Gandhi, D., Gupta, A. and Salakhutdinov, R., 2020. Object Goal Navigation using Goal-Oriented Semantic Exploration. In Neural Information Processing Systems (NeurIPS-20). ([PDF](https://arxiv.org/pdf/2007.00643.pdf)) - -### Bibtex: -``` -@inproceedings{chaplot2020object, - title={Object Goal Navigation using Goal-Oriented Semantic Exploration}, - author={Chaplot, Devendra Singh and Gandhi, Dhiraj and - Gupta, Abhinav and Salakhutdinov, Ruslan}, - booktitle={In Neural Information Processing Systems (NeurIPS)}, - year={2020} - } -``` - -## Related Projects -- This project builds on the [Active Neural SLAM](https://devendrachaplot.github.io/projects/Neural-SLAM) paper. The code and pretrained models for the Active Neural SLAM system are available at: -https://github.com/devendrachaplot/Neural-SLAM. -- The Semantic Mapping module is similar to the one used in [Semantic Curiosity](https://devendrachaplot.github.io/projects/SemanticCuriosity). - -## Acknowledgements -This repository uses [Habitat Lab](https://github.com/facebookresearch/habitat-lab) implementation for running the RL environment. -The implementation of PPO is borrowed from [ikostrikov/pytorch-a2c-ppo-acktr-gail](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/). -The Mask-RCNN implementation is based on the [detectron2](https://github.com/facebookresearch/detectron2/) repository. We would also like to thank Shubham Tulsiani and Saurabh Gupta for their help in implementing some parts of the code. 
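Before the agent code that follows, a schematic sketch of the control loop described in the Overview above may help: the Semantic Mapping module updates the map every step, the Goal-Oriented Semantic Policy picks a new long-term goal every `num_local_steps` steps (25 by default), and the deterministic Local Policy plans low-level actions toward it. The `mapper`, `global_policy` and `local_policy` callables below are placeholders for those modules, not functions defined in this repository; the real loop lives in `main.py` and `agents/sem_exp.py`.
```
# Schematic only: mapper, global_policy and local_policy are placeholder
# callables standing in for the Semantic Mapping module, the Goal-Oriented
# Semantic Policy and the FMM-based Local Policy.
def run_semexp_episode(env, mapper, global_policy, local_policy,
                       num_local_steps=25, max_steps=500):
    obs, info = env.reset()
    sem_map = mapper(None, obs, info)            # build the initial semantic map
    long_term_goal = None
    for step in range(max_steps):
        if step % num_local_steps == 0:
            # pick a long-term goal cell on the map for the target category
            long_term_goal = global_policy(sem_map, info["goal_cat_id"])
        # analytical planner returns a discrete action toward the long-term goal
        action = local_policy(sem_map, long_term_goal)
        obs, reward, done, info = env.step({"action": action})
        sem_map = mapper(sem_map, obs, info)     # update the map with the new frame
        if done:
            break
    return info
```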
diff --git a/agents/sem_exp.py b/agents/sem_exp.py deleted file mode 100644 index 80283fc..0000000 --- a/agents/sem_exp.py +++ /dev/null @@ -1,416 +0,0 @@ -import math -import os -import cv2 -import numpy as np -import skimage.morphology -from PIL import Image -from torchvision import transforms - -from envs.utils.fmm_planner import FMMPlanner -from envs.habitat.objectgoal_env import ObjectGoal_Env -from agents.utils.semantic_prediction import SemanticPredMaskRCNN -from constants import color_palette -import envs.utils.pose as pu -import agents.utils.visualization as vu - - -class Sem_Exp_Env_Agent(ObjectGoal_Env): - """The Sem_Exp environment agent class. A seperate Sem_Exp_Env_Agent class - object is used for each environment thread. - - """ - - def __init__(self, args, rank, config_env, dataset): - - self.args = args - super().__init__(args, rank, config_env, dataset) - - # initialize transform for RGB observations - self.res = transforms.Compose( - [transforms.ToPILImage(), - transforms.Resize((args.frame_height, args.frame_width), - interpolation=Image.NEAREST)]) - - # initialize semantic segmentation prediction model - if args.sem_gpu_id == -1: - args.sem_gpu_id = config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID - - self.sem_pred = SemanticPredMaskRCNN(args) - - # initializations for planning: - self.selem = skimage.morphology.disk(3) - - self.obs = None - self.obs_shape = None - self.collision_map = None - self.visited = None - self.visited_vis = None - self.col_width = None - self.curr_loc = None - self.last_loc = None - self.last_action = None - self.count_forward_actions = None - - if args.visualize or args.print_images: - self.legend = cv2.imread('docs/legend.png') - self.vis_image = None - self.rgb_vis = None - - def reset(self): - args = self.args - - obs, info = super().reset() - obs = self._preprocess_obs(obs) - - self.obs_shape = obs.shape - - # Episode initializations - map_shape = (args.map_size_cm // args.map_resolution, - args.map_size_cm // args.map_resolution) - self.collision_map = np.zeros(map_shape) - self.visited = np.zeros(map_shape) - self.visited_vis = np.zeros(map_shape) - self.col_width = 1 - self.count_forward_actions = 0 - self.curr_loc = [args.map_size_cm / 100.0 / 2.0, - args.map_size_cm / 100.0 / 2.0, 0.] - self.last_action = None - - if args.visualize or args.print_images: - self.vis_image = vu.init_vis_image(self.goal_name, self.legend) - - return obs, info - - def plan_act_and_preprocess(self, planner_inputs): - """Function responsible for planning, taking the action and - preprocessing observations - - Args: - planner_inputs (dict): - dict with following keys: - 'map_pred' (ndarray): (M, M) map prediction - 'goal' (ndarray): (M, M) mat denoting goal locations - 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) - and planning window (gx1, gx2, gy1, gy2) - 'found_goal' (bool): whether the goal object is found - - Returns: - obs (ndarray): preprocessed observations ((4+C) x H x W) - reward (float): amount of reward returned after previous action - done (bool): whether the episode has ended - info (dict): contains timestep, pose, goal category and - evaluation metric info - """ - - # plan - if planner_inputs["wait"]: - self.last_action = None - self.info["sensor_pose"] = [0., 0., 0.] 
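# When planner_inputs["wait"] is set, the agent skips planning for this step:
# the return below hands back a zero observation, zero reward and done=False
# without stepping the simulator, and sensor_pose = [0, 0, 0] tells the
# mapping code that the pose did not change.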
- return np.zeros(self.obs.shape), 0., False, self.info - - # Reset reward if new long-term goal - if planner_inputs["new_goal"]: - self.info["g_reward"] = 0 - - action = self._plan(planner_inputs) - - if self.args.visualize or self.args.print_images: - self._visualize(planner_inputs) - - if action >= 0: - - # act - action = {'action': action} - obs, rew, done, info = super().step(action) - - # preprocess obs - obs = self._preprocess_obs(obs) - self.last_action = action['action'] - self.obs = obs - self.info = info - - info['g_reward'] += rew - - return obs, rew, done, info - - else: - self.last_action = None - self.info["sensor_pose"] = [0., 0., 0.] - return np.zeros(self.obs_shape), 0., False, self.info - - def _plan(self, planner_inputs): - """Function responsible for planning - - Args: - planner_inputs (dict): - dict with following keys: - 'map_pred' (ndarray): (M, M) map prediction - 'goal' (ndarray): (M, M) goal locations - 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) - and planning window (gx1, gx2, gy1, gy2) - 'found_goal' (bool): whether the goal object is found - - Returns: - action (int): action id - """ - args = self.args - - self.last_loc = self.curr_loc - - # Get Map prediction - map_pred = np.rint(planner_inputs['map_pred']) - goal = planner_inputs['goal'] - - # Get pose prediction and global policy planning window - start_x, start_y, start_o, gx1, gx2, gy1, gy2 = \ - planner_inputs['pose_pred'] - gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) - planning_window = [gx1, gx2, gy1, gy2] - - # Get curr loc - self.curr_loc = [start_x, start_y, start_o] - r, c = start_y, start_x - start = [int(r * 100.0 / args.map_resolution - gx1), - int(c * 100.0 / args.map_resolution - gy1)] - start = pu.threshold_poses(start, map_pred.shape) - - self.visited[gx1:gx2, gy1:gy2][start[0] - 0:start[0] + 1, - start[1] - 0:start[1] + 1] = 1 - - if args.visualize or args.print_images: - # Get last loc - last_start_x, last_start_y = self.last_loc[0], self.last_loc[1] - r, c = last_start_y, last_start_x - last_start = [int(r * 100.0 / args.map_resolution - gx1), - int(c * 100.0 / args.map_resolution - gy1)] - last_start = pu.threshold_poses(last_start, map_pred.shape) - self.visited_vis[gx1:gx2, gy1:gy2] = \ - vu.draw_line(last_start, start, - self.visited_vis[gx1:gx2, gy1:gy2]) - - # Collision check - if self.last_action == 1: - x1, y1, t1 = self.last_loc - x2, y2, _ = self.curr_loc - buf = 4 - length = 2 - - if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05: - self.col_width += 2 - if self.col_width == 7: - length = 4 - buf = 3 - self.col_width = min(self.col_width, 5) - else: - self.col_width = 1 - - dist = pu.get_l2_distance(x1, x2, y1, y2) - if dist < args.collision_threshold: # Collision - width = self.col_width - for i in range(length): - for j in range(width): - wx = x1 + 0.05 * \ - ((i + buf) * np.cos(np.deg2rad(t1)) - + (j - width // 2) * np.sin(np.deg2rad(t1))) - wy = y1 + 0.05 * \ - ((i + buf) * np.sin(np.deg2rad(t1)) - - (j - width // 2) * np.cos(np.deg2rad(t1))) - r, c = wy, wx - r, c = int(r * 100 / args.map_resolution), \ - int(c * 100 / args.map_resolution) - [r, c] = pu.threshold_poses([r, c], - self.collision_map.shape) - self.collision_map[r, c] = 1 - - stg, stop = self._get_stg(map_pred, start, np.copy(goal), - planning_window) - - # Deterministic Local Policy - if stop and planner_inputs['found_goal'] == 1: - action = 0 # Stop - else: - (stg_x, stg_y) = stg - angle_st_goal = math.degrees(math.atan2(stg_x - start[0], - stg_y - start[1])) - angle_agent = 
(start_o) % 360.0 - if angle_agent > 180: - angle_agent -= 360 - - relative_angle = (angle_agent - angle_st_goal) % 360.0 - if relative_angle > 180: - relative_angle -= 360 - - if relative_angle > self.args.turn_angle / 2.: - action = 3 # Right - elif relative_angle < -self.args.turn_angle / 2.: - action = 2 # Left - else: - action = 1 # Forward - - return action - - def _get_stg(self, grid, start, goal, planning_window): - """Get short-term goal""" - - [gx1, gx2, gy1, gy2] = planning_window - - x1, y1, = 0, 0 - x2, y2 = grid.shape - - def add_boundary(mat, value=1): - h, w = mat.shape - new_mat = np.zeros((h + 2, w + 2)) + value - new_mat[1:h + 1, 1:w + 1] = mat - return new_mat - - traversible = skimage.morphology.binary_dilation( - grid[x1:x2, y1:y2], - self.selem) != True - traversible[self.collision_map[gx1:gx2, gy1:gy2] - [x1:x2, y1:y2] == 1] = 0 - traversible[self.visited[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 1 - - traversible[int(start[0] - x1) - 1:int(start[0] - x1) + 2, - int(start[1] - y1) - 1:int(start[1] - y1) + 2] = 1 - - traversible = add_boundary(traversible) - goal = add_boundary(goal, value=0) - - planner = FMMPlanner(traversible) - selem = skimage.morphology.disk(10) - goal = skimage.morphology.binary_dilation( - goal, selem) != True - goal = 1 - goal * 1. - planner.set_multi_goal(goal) - - state = [start[0] - x1 + 1, start[1] - y1 + 1] - stg_x, stg_y, _, stop = planner.get_short_term_goal(state) - - stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1 - - return (stg_x, stg_y), stop - - def _preprocess_obs(self, obs, use_seg=True): - args = self.args - obs = obs.transpose(1, 2, 0) - rgb = obs[:, :, :3] - depth = obs[:, :, 3:4] - - sem_seg_pred = self._get_sem_pred( - rgb.astype(np.uint8), use_seg=use_seg) - depth = self._preprocess_depth(depth, args.min_depth, args.max_depth) - - ds = args.env_frame_width // args.frame_width # Downscaling factor - if ds != 1: - rgb = np.asarray(self.res(rgb.astype(np.uint8))) - depth = depth[ds // 2::ds, ds // 2::ds] - sem_seg_pred = sem_seg_pred[ds // 2::ds, ds // 2::ds] - - depth = np.expand_dims(depth, axis=2) - state = np.concatenate((rgb, depth, sem_seg_pred), - axis=2).transpose(2, 0, 1) - - return state - - def _preprocess_depth(self, depth, min_d, max_d): - depth = depth[:, :, 0] * 1 - - for i in range(depth.shape[1]): - depth[:, i][depth[:, i] == 0.] = depth[:, i].max() - - mask2 = depth > 0.99 - depth[mask2] = 0. 
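# Up to this point depth holds the sensor's normalized reading in [0, 1]:
# zero (invalid) pixels were filled column-wise with the column maximum above,
# and near-saturated readings (> 0.99) have just been zeroed. The lines below
# push any remaining zeros far out of range (100.0) and rescale everything to
# centimeters via min_d * 100 + depth * max_d * 100.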
- - mask1 = depth == 0 - depth[mask1] = 100.0 - depth = min_d * 100.0 + depth * max_d * 100.0 - return depth - - def _get_sem_pred(self, rgb, use_seg=True): - if use_seg: - semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb) - semantic_pred = semantic_pred.astype(np.float32) - else: - semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16)) - self.rgb_vis = rgb[:, :, ::-1] - return semantic_pred - - def _visualize(self, inputs): - args = self.args - dump_dir = "{}/dump/{}/".format(args.dump_location, - args.exp_name) - ep_dir = '{}/episodes/thread_{}/eps_{}/'.format( - dump_dir, self.rank, self.episode_no) - if not os.path.exists(ep_dir): - os.makedirs(ep_dir) - - map_pred = inputs['map_pred'] - exp_pred = inputs['exp_pred'] - start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs['pose_pred'] - - goal = inputs['goal'] - sem_map = inputs['sem_map_pred'] - - gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) - - sem_map += 5 - - no_cat_mask = sem_map == 20 - map_mask = np.rint(map_pred) == 1 - exp_mask = np.rint(exp_pred) == 1 - vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1 - - sem_map[no_cat_mask] = 0 - m1 = np.logical_and(no_cat_mask, exp_mask) - sem_map[m1] = 2 - - m2 = np.logical_and(no_cat_mask, map_mask) - sem_map[m2] = 1 - - sem_map[vis_mask] = 3 - - selem = skimage.morphology.disk(4) - goal_mat = 1 - skimage.morphology.binary_dilation( - goal, selem) != True - - goal_mask = goal_mat == 1 - sem_map[goal_mask] = 4 - - color_pal = [int(x * 255.) for x in color_palette] - sem_map_vis = Image.new("P", (sem_map.shape[1], - sem_map.shape[0])) - sem_map_vis.putpalette(color_pal) - sem_map_vis.putdata(sem_map.flatten().astype(np.uint8)) - sem_map_vis = sem_map_vis.convert("RGB") - sem_map_vis = np.flipud(sem_map_vis) - - sem_map_vis = sem_map_vis[:, :, [2, 1, 0]] - sem_map_vis = cv2.resize(sem_map_vis, (480, 480), - interpolation=cv2.INTER_NEAREST) - self.vis_image[50:530, 15:655] = self.rgb_vis - self.vis_image[50:530, 670:1150] = sem_map_vis - - pos = ( - (start_x * 100. / args.map_resolution - gy1) - * 480 / map_pred.shape[0], - (map_pred.shape[1] - start_y * 100. 
/ args.map_resolution + gx1) - * 480 / map_pred.shape[1], - np.deg2rad(-start_o) - ) - - agent_arrow = vu.get_contour_points(pos, origin=(670, 50)) - color = (int(color_palette[11] * 255), - int(color_palette[10] * 255), - int(color_palette[9] * 255)) - cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1) - - if args.visualize: - # Displaying the image - cv2.imshow("Thread {}".format(self.rank), self.vis_image) - cv2.waitKey(1) - - if args.print_images: - fn = '{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png'.format( - dump_dir, self.rank, self.episode_no, - self.rank, self.episode_no, self.timestep) - cv2.imwrite(fn, self.vis_image) diff --git a/algo/__init__.py b/algo/__init__.py deleted file mode 100644 index 91ac6f5..0000000 --- a/algo/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ppo import PPO diff --git a/algo/ppo.py b/algo/ppo.py deleted file mode 100644 index e2ea796..0000000 --- a/algo/ppo.py +++ /dev/null @@ -1,108 +0,0 @@ -# The following code is largely borrowed from: -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/ppo.py - -import torch -import torch.nn as nn -import torch.optim as optim - - -class PPO(): - - def __init__( - self, - actor_critic, - clip_param, - ppo_epoch, - num_mini_batch, - value_loss_coef, - entropy_coef, - lr=None, - eps=None, - max_grad_norm=None, - use_clipped_value_loss=True): - - self.actor_critic = actor_critic - - self.clip_param = clip_param - self.ppo_epoch = ppo_epoch - self.num_mini_batch = num_mini_batch - - self.value_loss_coef = value_loss_coef - self.entropy_coef = entropy_coef - - self.max_grad_norm = max_grad_norm - self.use_clipped_value_loss = use_clipped_value_loss - - self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, - actor_critic.parameters()), - lr=lr, eps=eps) - - def update(self, rollouts): - advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] - advantages = (advantages - advantages.mean()) / ( - advantages.std() + 1e-5) - - value_loss_epoch = 0 - action_loss_epoch = 0 - dist_entropy_epoch = 0 - - for _ in range(self.ppo_epoch): - - if self.actor_critic.is_recurrent: - data_generator = rollouts.recurrent_generator( - advantages, self.num_mini_batch) - else: - data_generator = rollouts.feed_forward_generator( - advantages, self.num_mini_batch) - - for sample in data_generator: - - value_preds = sample['value_preds'] - returns = sample['returns'] - adv_targ = sample['adv_targ'] - - # Reshape to do in a single forward pass for all steps - values, action_log_probs, dist_entropy, _ = \ - self.actor_critic.evaluate_actions( - sample['obs'], sample['rec_states'], - sample['masks'], sample['actions'], - extras=sample['extras'] - ) - - ratio = torch.exp(action_log_probs - - sample['old_action_log_probs']) - surr1 = ratio * adv_targ - surr2 = torch.clamp(ratio, 1.0 - self.clip_param, - 1.0 + self.clip_param) * adv_targ - action_loss = -torch.min(surr1, surr2).mean() - - if self.use_clipped_value_loss: - value_pred_clipped = value_preds + \ - (values - value_preds).clamp( - -self.clip_param, self.clip_param) - value_losses = (values - returns).pow(2) - value_losses_clipped = (value_pred_clipped - - returns).pow(2) - value_loss = .5 * torch.max(value_losses, - value_losses_clipped).mean() - else: - value_loss = 0.5 * (returns - values).pow(2).mean() - - self.optimizer.zero_grad() - (value_loss * self.value_loss_coef + action_loss - - dist_entropy * self.entropy_coef).backward() - nn.utils.clip_grad_norm_(self.actor_critic.parameters(), - self.max_grad_norm) - 
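# The backward pass just above computed gradients of the combined PPO
# objective (value_loss * value_loss_coef + action_loss
# - dist_entropy * entropy_coef); clipping their global norm to max_grad_norm
# keeps a single noisy mini-batch from producing an oversized update when
# optimizer.step() runs next.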
self.optimizer.step() - - value_loss_epoch += value_loss.item() - action_loss_epoch += action_loss.item() - dist_entropy_epoch += dist_entropy.item() - - num_updates = self.ppo_epoch * self.num_mini_batch - - value_loss_epoch /= num_updates - action_loss_epoch /= num_updates - dist_entropy_epoch /= num_updates - - return value_loss_epoch, action_loss_epoch, dist_entropy_epoch diff --git a/arguments.py b/arguments.py deleted file mode 100644 index baafb20..0000000 --- a/arguments.py +++ /dev/null @@ -1,230 +0,0 @@ -import argparse -import torch - - -def get_args(): - parser = argparse.ArgumentParser( - description='Goal-Oriented-Semantic-Exploration') - - # General Arguments - parser.add_argument('--seed', type=int, default=1, - help='random seed (default: 1)') - parser.add_argument('--auto_gpu_config', type=int, default=1) - parser.add_argument('--total_num_scenes', type=str, default="auto") - parser.add_argument('-n', '--num_processes', type=int, default=5, - help="""how many training processes to use (default:5) - Overridden when auto_gpu_config=1 - and training on gpus""") - parser.add_argument('--num_processes_per_gpu', type=int, default=6) - parser.add_argument('--num_processes_on_first_gpu', type=int, default=1) - parser.add_argument('--eval', type=int, default=0, - help='0: Train, 1: Evaluate (default: 0)') - parser.add_argument('--num_training_frames', type=int, default=10000000, - help='total number of training frames') - parser.add_argument('--num_eval_episodes', type=int, default=200, - help="number of test episodes per scene") - parser.add_argument('--num_train_episodes', type=int, default=10000, - help="""number of train episodes per scene - before loading the next scene""") - parser.add_argument('--no_cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument("--sim_gpu_id", type=int, default=0, - help="gpu id on which scenes are loaded") - parser.add_argument("--sem_gpu_id", type=int, default=-1, - help="""gpu id for semantic model, - -1: same as sim gpu, -2: cpu""") - - # Logging, loading models, visualization - parser.add_argument('--log_interval', type=int, default=10, - help="""log interval, one log per n updates - (default: 10) """) - parser.add_argument('--save_interval', type=int, default=1, - help="""save interval""") - parser.add_argument('-d', '--dump_location', type=str, default="./tmp/", - help='path to dump models and log (default: ./tmp/)') - parser.add_argument('--exp_name', type=str, default="exp1", - help='experiment name (default: exp1)') - parser.add_argument('--save_periodic', type=int, default=500000, - help='Model save frequency in number of updates') - parser.add_argument('--load', type=str, default="0", - help="""model path to load, - 0 to not reload (default: 0)""") - parser.add_argument('-v', '--visualize', type=int, default=0, - help="""1: Render the observation and - the predicted semantic map, - 2: Render the observation with semantic - predictions and the predicted semantic map - (default: 0)""") - parser.add_argument('--print_images', type=int, default=0, - help='1: save visualization as images') - - # Environment, dataset and episode specifications - parser.add_argument('-efw', '--env_frame_width', type=int, default=640, - help='Frame width (default:640)') - parser.add_argument('-efh', '--env_frame_height', type=int, default=480, - help='Frame height (default:480)') - parser.add_argument('-fw', '--frame_width', type=int, default=160, - help='Frame width (default:160)') - parser.add_argument('-fh', 
'--frame_height', type=int, default=120, - help='Frame height (default:120)') - parser.add_argument('-el', '--max_episode_length', type=int, default=500, - help="""Maximum episode length""") - parser.add_argument("--task_config", type=str, - default="tasks/objectnav_gibson.yaml", - help="path to config yaml containing task information") - parser.add_argument("--split", type=str, default="train", - help="dataset split (train | val | val_mini) ") - parser.add_argument('--camera_height', type=float, default=0.88, - help="agent camera height in metres") - parser.add_argument('--hfov', type=float, default=79.0, - help="horizontal field of view in degrees") - parser.add_argument('--turn_angle', type=float, default=30, - help="Agent turn angle in degrees") - parser.add_argument('--min_depth', type=float, default=0.5, - help="Minimum depth for depth sensor in meters") - parser.add_argument('--max_depth', type=float, default=5.0, - help="Maximum depth for depth sensor in meters") - parser.add_argument('--success_dist', type=float, default=1.0, - help="success distance threshold in meters") - parser.add_argument('--floor_thr', type=int, default=50, - help="floor threshold in cm") - parser.add_argument('--min_d', type=float, default=1.5, - help="min distance to goal during training in meters") - parser.add_argument('--max_d', type=float, default=100.0, - help="max distance to goal during training in meters") - parser.add_argument('--version', type=str, default="v1.1", - help="dataset version") - - # Model Hyperparameters - parser.add_argument('--agent', type=str, default="sem_exp") - parser.add_argument('--lr', type=float, default=2.5e-5, - help='learning rate (default: 2.5e-5)') - parser.add_argument('--global_hidden_size', type=int, default=256, - help='global_hidden_size') - parser.add_argument('--eps', type=float, default=1e-5, - help='RL Optimizer epsilon (default: 1e-5)') - parser.add_argument('--alpha', type=float, default=0.99, - help='RL Optimizer alpha (default: 0.99)') - parser.add_argument('--gamma', type=float, default=0.99, - help='discount factor for rewards (default: 0.99)') - parser.add_argument('--use_gae', action='store_true', default=False, - help='use generalized advantage estimation') - parser.add_argument('--tau', type=float, default=0.95, - help='gae parameter (default: 0.95)') - parser.add_argument('--entropy_coef', type=float, default=0.001, - help='entropy term coefficient (default: 0.01)') - parser.add_argument('--value_loss_coef', type=float, default=0.5, - help='value loss coefficient (default: 0.5)') - parser.add_argument('--max_grad_norm', type=float, default=0.5, - help='max norm of gradients (default: 0.5)') - parser.add_argument('--num_global_steps', type=int, default=20, - help='number of forward steps in A2C (default: 5)') - parser.add_argument('--ppo_epoch', type=int, default=4, - help='number of ppo epochs (default: 4)') - parser.add_argument('--num_mini_batch', type=str, default="auto", - help='number of batches for ppo (default: 32)') - parser.add_argument('--clip_param', type=float, default=0.2, - help='ppo clip parameter (default: 0.2)') - parser.add_argument('--use_recurrent_global', type=int, default=0, - help='use a recurrent global policy') - parser.add_argument('--num_local_steps', type=int, default=25, - help="""Number of steps the local policy - between each global step""") - parser.add_argument('--reward_coeff', type=float, default=0.1, - help="Object goal reward coefficient") - parser.add_argument('--intrinsic_rew_coeff', type=float, default=0.02, - 
help="intrinsic exploration reward coefficient") - parser.add_argument('--num_sem_categories', type=float, default=16) - parser.add_argument('--sem_pred_prob_thr', type=float, default=0.9, - help="Semantic prediction confidence threshold") - - # Mapping - parser.add_argument('--global_downscaling', type=int, default=2) - parser.add_argument('--vision_range', type=int, default=100) - parser.add_argument('--map_resolution', type=int, default=5) - parser.add_argument('--du_scale', type=int, default=1) - parser.add_argument('--map_size_cm', type=int, default=2400) - parser.add_argument('--cat_pred_threshold', type=float, default=5.0) - parser.add_argument('--map_pred_threshold', type=float, default=1.0) - parser.add_argument('--exp_pred_threshold', type=float, default=1.0) - parser.add_argument('--collision_threshold', type=float, default=0.20) - - # parse arguments - args = parser.parse_args() - - args.cuda = not args.no_cuda and torch.cuda.is_available() - - if args.cuda: - if args.auto_gpu_config: - num_gpus = torch.cuda.device_count() - if args.total_num_scenes != "auto": - args.total_num_scenes = int(args.total_num_scenes) - elif "objectnav_gibson" in args.task_config and \ - "train" in args.split: - args.total_num_scenes = 25 - elif "objectnav_gibson" in args.task_config and \ - "val" in args.split: - args.total_num_scenes = 5 - else: - assert False, "Unknown task config, please specify" + \ - " total_num_scenes" - - # GPU Memory required for the SemExp model: - # 0.8 + 0.4 * args.total_num_scenes (GB) - # GPU Memory required per thread: 2.6 (GB) - min_memory_required = max(0.8 + 0.4 * args.total_num_scenes, 2.6) - # Automatically configure number of training threads based on - # number of GPUs available and GPU memory size - gpu_memory = 1000 - for i in range(num_gpus): - gpu_memory = min(gpu_memory, - torch.cuda.get_device_properties( - i).total_memory - / 1024 / 1024 / 1024) - assert gpu_memory > min_memory_required, \ - """Insufficient GPU memory for GPU {}, gpu memory ({}GB) - needs to be greater than {}GB""".format( - i, gpu_memory, min_memory_required) - - num_processes_per_gpu = int(gpu_memory / 2.6) - num_processes_on_first_gpu = \ - int((gpu_memory - min_memory_required) / 2.6) - - if args.eval: - max_threads = num_processes_per_gpu * (num_gpus - 1) \ - + num_processes_on_first_gpu - assert max_threads >= args.total_num_scenes, \ - """Insufficient GPU memory for evaluation""" - - if num_gpus == 1: - args.num_processes_on_first_gpu = num_processes_on_first_gpu - args.num_processes_per_gpu = 0 - args.num_processes = num_processes_on_first_gpu - assert args.num_processes > 0, "Insufficient GPU memory" - else: - num_threads = num_processes_per_gpu * (num_gpus - 1) \ - + num_processes_on_first_gpu - num_threads = min(num_threads, args.total_num_scenes) - args.num_processes_per_gpu = num_processes_per_gpu - args.num_processes_on_first_gpu = max( - 0, - num_threads - args.num_processes_per_gpu * (num_gpus - 1)) - args.num_processes = num_threads - - args.sim_gpu_id = 1 - - print("Auto GPU config:") - print("Number of processes: {}".format(args.num_processes)) - print("Number of processes on GPU 0: {}".format( - args.num_processes_on_first_gpu)) - print("Number of processes per GPU: {}".format( - args.num_processes_per_gpu)) - else: - args.sem_gpu_id = -2 - - if args.num_mini_batch == "auto": - args.num_mini_batch = max(args.num_processes // 2, 1) - else: - args.num_mini_batch = int(args.num_mini_batch) - - return args diff --git a/configs/Base-RCNN-FPN.yaml 
b/configs/Base-RCNN-FPN.yaml deleted file mode 100644 index 3e020f2..0000000 --- a/configs/Base-RCNN-FPN.yaml +++ /dev/null @@ -1,42 +0,0 @@ -MODEL: - META_ARCHITECTURE: "GeneralizedRCNN" - BACKBONE: - NAME: "build_resnet_fpn_backbone" - RESNETS: - OUT_FEATURES: ["res2", "res3", "res4", "res5"] - FPN: - IN_FEATURES: ["res2", "res3", "res4", "res5"] - ANCHOR_GENERATOR: - SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map - ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) - RPN: - IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] - PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level - PRE_NMS_TOPK_TEST: 1000 # Per FPN level - # Detectron1 uses 2000 proposals per-batch, - # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) - # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. - POST_NMS_TOPK_TRAIN: 1000 - POST_NMS_TOPK_TEST: 1000 - ROI_HEADS: - NAME: "StandardROIHeads" - IN_FEATURES: ["p2", "p3", "p4", "p5"] - ROI_BOX_HEAD: - NAME: "FastRCNNConvFCHead" - NUM_FC: 2 - POOLER_RESOLUTION: 7 - ROI_MASK_HEAD: - NAME: "MaskRCNNConvUpsampleHead" - NUM_CONV: 4 - POOLER_RESOLUTION: 14 -DATASETS: - TRAIN: ("coco_2017_train",) - TEST: ("coco_2017_val",) -SOLVER: - IMS_PER_BATCH: 16 - BASE_LR: 0.02 - STEPS: (60000, 80000) - MAX_ITER: 90000 -INPUT: - MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) -VERSION: 2 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml deleted file mode 100644 index be7d06b..0000000 --- a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml +++ /dev/null @@ -1,9 +0,0 @@ -_BASE_: "../Base-RCNN-FPN.yaml" -MODEL: - WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" - MASK_ON: True - RESNETS: - DEPTH: 50 -SOLVER: - STEPS: (210000, 250000) - MAX_ITER: 270000 diff --git a/constants.py b/constants.py deleted file mode 100644 index 1f0179e..0000000 --- a/constants.py +++ /dev/null @@ -1,94 +0,0 @@ -scenes = {} -scenes["train"] = [ - 'Allensville', - 'Beechwood', - 'Benevolence', - 'Coffeen', - 'Cosmos', - 'Forkland', - 'Hanson', - 'Hiteman', - 'Klickitat', - 'Lakeville', - 'Leonardo', - 'Lindenwood', - 'Marstons', - 'Merom', - 'Mifflinburg', - 'Newfields', - 'Onaga', - 'Pinesdale', - 'Pomaria', - 'Ranchester', - 'Shelbyville', - 'Stockman', - 'Tolstoy', - 'Wainscott', - 'Woodbine', -] - -scenes["val"] = [ - 'Collierville', - 'Corozal', - 'Darden', - 'Markleeville', - 'Wiconisco', -] - -coco_categories = { - "chair": 0, - "couch": 1, - "potted plant": 2, - "bed": 3, - "toilet": 4, - "tv": 5, - "dining-table": 6, - "oven": 7, - "sink": 8, - "refrigerator": 9, - "book": 10, - "clock": 11, - "vase": 12, - "cup": 13, - "bottle": 14 -} - -coco_categories_mapping = { - 56: 0, # chair - 57: 1, # couch - 58: 2, # potted plant - 59: 3, # bed - 61: 4, # toilet - 62: 5, # tv - 60: 6, # dining-table - 69: 7, # oven - 71: 8, # sink - 72: 9, # refrigerator - 73: 10, # book - 74: 11, # clock - 75: 12, # vase - 41: 13, # cup - 39: 14, # bottle -} - -color_palette = [ - 1.0, 1.0, 1.0, - 0.6, 0.6, 0.6, - 0.95, 0.95, 0.95, - 0.96, 0.36, 0.26, - 0.12156862745098039, 0.47058823529411764, 0.7058823529411765, - 0.9400000000000001, 0.7818, 0.66, - 0.9400000000000001, 0.8868, 0.66, - 0.8882000000000001, 0.9400000000000001, 0.66, - 0.7832000000000001, 0.9400000000000001, 0.66, - 0.6782000000000001, 0.9400000000000001, 0.66, - 0.66, 0.9400000000000001, 0.7468000000000001, - 0.66, 0.9400000000000001, 
0.8518000000000001, - 0.66, 0.9232, 0.9400000000000001, - 0.66, 0.8182, 0.9400000000000001, - 0.66, 0.7132, 0.9400000000000001, - 0.7117999999999999, 0.66, 0.9400000000000001, - 0.8168, 0.66, 0.9400000000000001, - 0.9218, 0.66, 0.9400000000000001, - 0.9400000000000001, 0.66, 0.8531999999999998, - 0.9400000000000001, 0.66, 0.748199999999999] diff --git a/envs/habitat/__init__.py b/envs/habitat/__init__.py deleted file mode 100644 index e04b9ed..0000000 --- a/envs/habitat/__init__.py +++ /dev/null @@ -1,150 +0,0 @@ -# Parts of the code in this file have been borrowed from: -# https://github.com/facebookresearch/habitat-api -import os -import numpy as np -import torch -from habitat.config.default import get_config as cfg_env -from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1 -from habitat import Config, Env, RLEnv, VectorEnv, make_dataset - -from agents.sem_exp import Sem_Exp_Env_Agent -from .objectgoal_env import ObjectGoal_Env - -from .utils.vector_env import VectorEnv - - -def make_env_fn(args, config_env, rank): - dataset = make_dataset(config_env.DATASET.TYPE, config=config_env.DATASET) - config_env.defrost() - config_env.SIMULATOR.SCENE = dataset.episodes[0].scene_id - config_env.freeze() - - if args.agent == "sem_exp": - env = Sem_Exp_Env_Agent(args=args, rank=rank, - config_env=config_env, - dataset=dataset - ) - else: - env = ObjectGoal_Env(args=args, rank=rank, - config_env=config_env, - dataset=dataset - ) - - env.seed(rank) - return env - - -def _get_scenes_from_folder(content_dir): - scene_dataset_ext = ".glb.json.gz" - scenes = [] - for filename in os.listdir(content_dir): - if filename.endswith(scene_dataset_ext): - scene = filename[: -len(scene_dataset_ext) + 4] - scenes.append(scene) - scenes.sort() - return scenes - - -def construct_envs(args): - env_configs = [] - args_list = [] - - basic_config = cfg_env(config_paths=["envs/habitat/configs/" - + args.task_config]) - basic_config.defrost() - basic_config.DATASET.SPLIT = args.split - basic_config.DATASET.DATA_PATH = \ - basic_config.DATASET.DATA_PATH.replace("v1", args.version) - basic_config.DATASET.EPISODES_DIR = \ - basic_config.DATASET.EPISODES_DIR.replace("v1", args.version) - basic_config.freeze() - - scenes = basic_config.DATASET.CONTENT_SCENES - if "*" in basic_config.DATASET.CONTENT_SCENES: - content_dir = os.path.join(basic_config.DATASET.EPISODES_DIR.format( - split=args.split), "content") - scenes = _get_scenes_from_folder(content_dir) - - if len(scenes) > 0: - assert len(scenes) >= args.num_processes, ( - "reduce the number of processes as there " - "aren't enough number of scenes" - ) - - scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes)) - for _ in range(args.num_processes)] - for i in range(len(scenes) % args.num_processes): - scene_split_sizes[i] += 1 - - print("Scenes per thread:") - for i in range(args.num_processes): - config_env = cfg_env(config_paths=["envs/habitat/configs/" - + args.task_config]) - config_env.defrost() - - if len(scenes) > 0: - config_env.DATASET.CONTENT_SCENES = scenes[ - sum(scene_split_sizes[:i]): - sum(scene_split_sizes[:i + 1]) - ] - print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES)) - - if i < args.num_processes_on_first_gpu: - gpu_id = 0 - else: - gpu_id = int((i - args.num_processes_on_first_gpu) - // args.num_processes_per_gpu) + args.sim_gpu_id - gpu_id = min(torch.cuda.device_count() - 1, gpu_id) - config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id - - agent_sensors = [] - agent_sensors.append("RGB_SENSOR") - 
agent_sensors.append("DEPTH_SENSOR") - # agent_sensors.append("SEMANTIC_SENSOR") - - config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors - - # Reseting episodes manually, setting high max episode length in sim - config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000 - config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False - - config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width - config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height - config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov - config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0] - - config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width - config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height - config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov - config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth - config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth - config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0] - - # config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width - # config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height - # config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov - # config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \ - # [0, args.camera_height, 0] - - config_env.SIMULATOR.TURN_ANGLE = args.turn_angle - config_env.DATASET.SPLIT = args.split - config_env.DATASET.DATA_PATH = \ - config_env.DATASET.DATA_PATH.replace("v1", args.version) - config_env.DATASET.EPISODES_DIR = \ - config_env.DATASET.EPISODES_DIR.replace("v1", args.version) - - config_env.freeze() - env_configs.append(config_env) - - args_list.append(args) - - envs = VectorEnv( - make_env_fn=make_env_fn, - env_fn_args=tuple( - tuple( - zip(args_list, env_configs, range(args.num_processes)) - ) - ), - ) - - return envs diff --git a/envs/habitat/configs/tasks/objectnav_gibson.yaml b/envs/habitat/configs/tasks/objectnav_gibson.yaml deleted file mode 100644 index d0b7d92..0000000 --- a/envs/habitat/configs/tasks/objectnav_gibson.yaml +++ /dev/null @@ -1,44 +0,0 @@ -ENVIRONMENT: - MAX_EPISODE_STEPS: 500 -SIMULATOR: - TURN_ANGLE: 30 - TILT_ANGLE: 30 - ACTION_SPACE_CONFIG: "v1" - AGENT_0: - SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR'] - HEIGHT: 0.88 - RADIUS: 0.18 - HABITAT_SIM_V0: - GPU_DEVICE_ID: 0 - ALLOW_SLIDING: True - SEMANTIC_SENSOR: - WIDTH: 640 - HEIGHT: 480 - HFOV: 79 - POSITION: [0, 0.88, 0] - RGB_SENSOR: - WIDTH: 640 - HEIGHT: 480 - HFOV: 79 - POSITION: [0, 0.88, 0] - DEPTH_SENSOR: - WIDTH: 640 - HEIGHT: 480 - HFOV: 79 - MIN_DEPTH: 0.5 - MAX_DEPTH: 5.0 - POSITION: [0, 0.88, 0] -TASK: - TYPE: ObjectNav-v1 - POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] - SENSORS: ['GPS_SENSOR', 'COMPASS_SENSOR'] - MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] - SUCCESS: - SUCCESS_DISTANCE: 0.2 - -DATASET: - TYPE: PointNav-v1 - SPLIT: train - DATA_PATH: "data/datasets/objectnav/gibson/v1/{split}/{split}.json.gz" - EPISODES_DIR: "data/datasets/objectnav/gibson/v1/{split}/" - SCENES_DIR: "data/scene_datasets/" diff --git a/envs/habitat/objectgoal_env.py b/envs/habitat/objectgoal_env.py deleted file mode 100644 index a08dd55..0000000 --- a/envs/habitat/objectgoal_env.py +++ /dev/null @@ -1,465 +0,0 @@ -import json -import bz2 -import gzip -import _pickle as cPickle -import gym -import numpy as np -import quaternion -import skimage.morphology -import habitat - -from envs.utils.fmm_planner import FMMPlanner -from constants import coco_categories -import envs.utils.pose as pu - - -class 
ObjectGoal_Env(habitat.RLEnv): - """The Object Goal Navigation environment class. The class is responsible - for loading the dataset, generating episodes, and computing evaluation - metrics. - """ - - def __init__(self, args, rank, config_env, dataset): - self.args = args - self.rank = rank - - super().__init__(config_env, dataset) - - # Loading dataset info file - self.split = config_env.DATASET.SPLIT - self.episodes_dir = config_env.DATASET.EPISODES_DIR.format( - split=self.split) - - dataset_info_file = self.episodes_dir + \ - "{split}_info.pbz2".format(split=self.split) - with bz2.BZ2File(dataset_info_file, 'rb') as f: - self.dataset_info = cPickle.load(f) - - # Specifying action and observation space - self.action_space = gym.spaces.Discrete(3) - - self.observation_space = gym.spaces.Box(0, 255, - (3, args.frame_height, - args.frame_width), - dtype='uint8') - - # Initializations - self.episode_no = 0 - - # Scene info - self.last_scene_path = None - self.scene_path = None - self.scene_name = None - - # Episode Dataset info - self.eps_data = None - self.eps_data_idx = None - self.gt_planner = None - self.object_boundary = None - self.goal_idx = None - self.goal_name = None - self.map_obj_origin = None - self.starting_loc = None - self.starting_distance = None - - # Episode tracking info - self.curr_distance = None - self.prev_distance = None - self.timestep = None - self.stopped = None - self.path_length = None - self.last_sim_location = None - self.trajectory_states = [] - self.info = {} - self.info['distance_to_goal'] = None - self.info['spl'] = None - self.info['success'] = None - - def load_new_episode(self): - """The function loads a fixed episode from the episode dataset. This - function is used for evaluating a trained model on the val split. - """ - - args = self.args - self.scene_path = self.habitat_env.sim.config.SCENE - scene_name = self.scene_path.split("/")[-1].split(".")[0] - - if self.scene_path != self.last_scene_path: - episodes_file = self.episodes_dir + \ - "content/{}_episodes.json.gz".format(scene_name) - - print("Loading episodes from: {}".format(episodes_file)) - with gzip.open(episodes_file, 'r') as f: - self.eps_data = json.loads( - f.read().decode('utf-8'))["episodes"] - - self.eps_data_idx = 0 - self.last_scene_path = self.scene_path - - # Load episode info - episode = self.eps_data[self.eps_data_idx] - self.eps_data_idx += 1 - self.eps_data_idx = self.eps_data_idx % len(self.eps_data) - pos = episode["start_position"] - rot = quaternion.from_float_array(episode["start_rotation"]) - - goal_name = episode["object_category"] - goal_idx = episode["object_id"] - floor_idx = episode["floor_id"] - - # Load scene info - scene_info = self.dataset_info[scene_name] - sem_map = scene_info[floor_idx]['sem_map'] - map_obj_origin = scene_info[floor_idx]['origin'] - - # Setup ground truth planner - object_boundary = args.success_dist - map_resolution = args.map_resolution - selem = skimage.morphology.disk(2) - traversible = skimage.morphology.binary_dilation( - sem_map[0], selem) != True - traversible = 1 - traversible - planner = FMMPlanner(traversible) - selem = skimage.morphology.disk( - int(object_boundary * 100. / map_resolution)) - goal_map = skimage.morphology.binary_dilation( - sem_map[goal_idx + 1], selem) != True - goal_map = 1 - goal_map - planner.set_multi_goal(goal_map) - - # Get starting loc in GT map coordinates - x = -pos[2] - y = -pos[0] - min_x, min_y = map_obj_origin / 100.0 - map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 
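# The hard-coded factor of 20. converts meters to ground-truth map cells:
# with the default map_resolution of 5 cm per cell, 1 m = 100 / 5 = 20 cells.
# The same constant is used wherever fmm_dist is divided by 20.0 to recover a
# distance in meters, and map_obj_origin is stored in centimeters (hence the
# / 100.0 above); Habitat's world frame is remapped with x = -pos[2], y = -pos[0].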
- - self.gt_planner = planner - self.starting_loc = map_loc - self.object_boundary = object_boundary - self.goal_idx = goal_idx - self.goal_name = goal_name - self.map_obj_origin = map_obj_origin - - self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc]\ - / 20.0 + self.object_boundary - self.prev_distance = self.starting_distance - self._env.sim.set_agent_state(pos, rot) - - # The following two should match approximately - # print(starting_loc) - # print(self.sim_continuous_to_sim_map(self.get_sim_location())) - - obs = self._env.sim.get_observations_at(pos, rot) - - return obs - - def generate_new_episode(self): - """The function generates a random valid episode. This function is used - for training a model on the train split. - """ - - args = self.args - - self.scene_path = self.habitat_env.sim.config.SCENE - scene_name = self.scene_path.split("/")[-1].split(".")[0] - - scene_info = self.dataset_info[scene_name] - map_resolution = args.map_resolution - - floor_idx = np.random.randint(len(scene_info.keys())) - floor_height = scene_info[floor_idx]['floor_height'] - sem_map = scene_info[floor_idx]['sem_map'] - map_obj_origin = scene_info[floor_idx]['origin'] - - cat_counts = sem_map.sum(2).sum(1) - possible_cats = list(np.arange(6)) - - for i in range(6): - if cat_counts[i + 1] == 0: - possible_cats.remove(i) - - object_boundary = args.success_dist - - loc_found = False - while not loc_found: - if len(possible_cats) == 0: - print("No valid objects for {}".format(floor_height)) - eps = eps - 1 - continue - - goal_idx = np.random.choice(possible_cats) - - for key, value in coco_categories.items(): - if value == goal_idx: - goal_name = key - - selem = skimage.morphology.disk(2) - traversible = skimage.morphology.binary_dilation( - sem_map[0], selem) != True - traversible = 1 - traversible - - planner = FMMPlanner(traversible) - - selem = skimage.morphology.disk( - int(object_boundary * 100. / map_resolution)) - goal_map = skimage.morphology.binary_dilation( - sem_map[goal_idx + 1], selem) != True - goal_map = 1 - goal_map - - planner.set_multi_goal(goal_map) - - m1 = sem_map[0] > 0 - m2 = planner.fmm_dist > (args.min_d - object_boundary) * 20.0 - m3 = planner.fmm_dist < (args.max_d - object_boundary) * 20.0 - - possible_starting_locs = np.logical_and(m1, m2) - possible_starting_locs = np.logical_and( - possible_starting_locs, m3) * 1. - if possible_starting_locs.sum() != 0: - loc_found = True - else: - print("Invalid object: {} / {} / {}".format( - scene_name, floor_height, goal_name)) - possible_cats.remove(goal_idx) - scene_info[floor_idx]["sem_map"][goal_idx + 1, :, :] = 0. - self.dataset_info[scene_name][floor_idx][ - "sem_map"][goal_idx + 1, :, :] = 0. - - loc_found = False - while not loc_found: - pos = self._env.sim.sample_navigable_point() - x = -pos[2] - y = -pos[0] - min_x, min_y = map_obj_origin / 100.0 - map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 
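# The sampled start position is accepted only if it lies on the chosen floor
# (within floor_thr centimeters of floor_height) and falls inside
# possible_starting_locs, i.e. on the mapped floor area at a geodesic distance
# between roughly min_d and max_d meters (minus the success boundary) from the
# goal, as computed above.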
- if abs(pos[1] - floor_height) < args.floor_thr / 100.0 and \ - possible_starting_locs[map_loc[0], map_loc[1]] == 1: - loc_found = True - - agent_state = self._env.sim.get_agent_state(0) - rotation = agent_state.rotation - rvec = quaternion.as_rotation_vector(rotation) - rvec[1] = np.random.rand() * 2 * np.pi - rot = quaternion.from_rotation_vector(rvec) - - self.gt_planner = planner - self.starting_loc = map_loc - self.object_boundary = object_boundary - self.goal_idx = goal_idx - self.goal_name = goal_name - self.map_obj_origin = map_obj_origin - - self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc] \ - / 20.0 + self.object_boundary - self.prev_distance = self.starting_distance - - self._env.sim.set_agent_state(pos, rot) - - # The following two should match approximately - # print(starting_loc) - # print(self.sim_continuous_to_sim_map(self.get_sim_location())) - - obs = self._env.sim.get_observations_at(pos, rot) - - return obs - - def sim_map_to_sim_continuous(self, coords): - """Converts ground-truth 2D Map coordinates to absolute Habitat - simulator position and rotation. - """ - agent_state = self._env.sim.get_agent_state(0) - y, x = coords - min_x, min_y = self.map_obj_origin / 100.0 - - cont_x = x / 20. + min_x - cont_y = y / 20. + min_y - agent_state.position[0] = cont_y - agent_state.position[2] = cont_x - - rotation = agent_state.rotation - rvec = quaternion.as_rotation_vector(rotation) - - if self.args.train_single_eps: - rvec[1] = 0.0 - else: - rvec[1] = np.random.rand() * 2 * np.pi - rot = quaternion.from_rotation_vector(rvec) - - return agent_state.position, rot - - def sim_continuous_to_sim_map(self, sim_loc): - """Converts absolute Habitat simulator pose to ground-truth 2D Map - coordinates. - """ - x, y, o = sim_loc - min_x, min_y = self.map_obj_origin / 100.0 - x, y = int((-x - min_x) * 20.), int((-y - min_y) * 20.) - - o = np.rad2deg(o) + 180.0 - return y, x, o - - def reset(self): - """Resets the environment to a new episode. - - Returns: - obs (ndarray): RGBD observations (4 x H x W) - info (dict): contains timestep, pose, goal category and - evaluation metric info - """ - args = self.args - new_scene = self.episode_no % args.num_train_episodes == 0 - - self.episode_no += 1 - - # Initializations - self.timestep = 0 - self.stopped = False - self.path_length = 1e-5 - self.trajectory_states = [] - - if new_scene: - obs = super().reset() - self.scene_name = self.habitat_env.sim.config.SCENE - print("Changing scene: {}/{}".format(self.rank, self.scene_name)) - - self.scene_path = self.habitat_env.sim.config.SCENE - - if self.split == "val": - obs = self.load_new_episode() - else: - obs = self.generate_new_episode() - - rgb = obs['rgb'].astype(np.uint8) - depth = obs['depth'] - state = np.concatenate((rgb, depth), axis=2).transpose(2, 0, 1) - self.last_sim_location = self.get_sim_location() - - # Set info - self.info['time'] = self.timestep - self.info['sensor_pose'] = [0., 0., 0.] - self.info['goal_cat_id'] = self.goal_idx - self.info['goal_name'] = self.goal_name - - return state, self.info - - def step(self, action): - """Function to take an action in the environment. 
- - Args: - action (dict): - dict with following keys: - 'action' (int): 0: stop, 1: forward, 2: left, 3: right - - Returns: - obs (ndarray): RGBD observations (4 x H x W) - reward (float): amount of reward returned after previous action - done (bool): whether the episode has ended - info (dict): contains timestep, pose, goal category and - evaluation metric info - """ - action = action["action"] - if action == 0: - self.stopped = True - # Not sending stop to simulator, resetting manually - action = 3 - - obs, rew, done, _ = super().step(action) - - # Get pose change - dx, dy, do = self.get_pose_change() - self.info['sensor_pose'] = [dx, dy, do] - self.path_length += pu.get_l2_distance(0, dx, 0, dy) - - spl, success, dist = 0., 0., 0. - if done: - spl, success, dist = self.get_metrics() - self.info['distance_to_goal'] = dist - self.info['spl'] = spl - self.info['success'] = success - - rgb = obs['rgb'].astype(np.uint8) - depth = obs['depth'] - state = np.concatenate((rgb, depth), axis=2).transpose(2, 0, 1) - - self.timestep += 1 - self.info['time'] = self.timestep - - return state, rew, done, self.info - - def get_reward_range(self): - """This function is not used, Habitat-RLEnv requires this function""" - return (0., 1.0) - - def get_reward(self, observations): - curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) - self.curr_distance = self.gt_planner.fmm_dist[curr_loc[0], - curr_loc[1]] / 20.0 - - reward = (self.prev_distance - self.curr_distance) * \ - self.args.reward_coeff - - self.prev_distance = self.curr_distance - return reward - - def get_metrics(self): - """This function computes evaluation metrics for the Object Goal task - - Returns: - spl (float): Success weighted by Path Length - (See https://arxiv.org/pdf/1807.06757.pdf) - success (int): 0: Failure, 1: Successful - dist (float): Distance to Success (DTS), distance of the agent - from the success threshold boundary in meters. 
- (See https://arxiv.org/pdf/2007.00643.pdf) - """ - curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) - dist = self.gt_planner.fmm_dist[curr_loc[0], curr_loc[1]] / 20.0 - if dist == 0.0: - success = 1 - else: - success = 0 - spl = min(success * self.starting_distance / self.path_length, 1) - return spl, success, dist - - def get_done(self, observations): - if self.info['time'] >= self.args.max_episode_length - 1: - done = True - elif self.stopped: - done = True - else: - done = False - return done - - def get_info(self, observations): - """This function is not used, Habitat-RLEnv requires this function""" - info = {} - return info - - def get_spaces(self): - """Returns observation and action spaces for the ObjectGoal task.""" - return self.observation_space, self.action_space - - def get_sim_location(self): - """Returns x, y, o pose of the agent in the Habitat simulator.""" - - agent_state = super().habitat_env.sim.get_agent_state(0) - x = -agent_state.position[2] - y = -agent_state.position[0] - axis = quaternion.as_euler_angles(agent_state.rotation)[0] - if (axis % (2 * np.pi)) < 0.1 or (axis % - (2 * np.pi)) > 2 * np.pi - 0.1: - o = quaternion.as_euler_angles(agent_state.rotation)[1] - else: - o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] - if o > np.pi: - o -= 2 * np.pi - return x, y, o - - def get_pose_change(self): - """Returns dx, dy, do pose change of the agent relative to the last - timestep.""" - curr_sim_pose = self.get_sim_location() - dx, dy, do = pu.get_rel_pose_change( - curr_sim_pose, self.last_sim_location) - self.last_sim_location = curr_sim_pose - return dx, dy, do diff --git a/envs/habitat/utils/vector_env.py b/envs/habitat/utils/vector_env.py deleted file mode 100644 index 389300a..0000000 --- a/envs/habitat/utils/vector_env.py +++ /dev/null @@ -1,586 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from multiprocessing.connection import Connection -from multiprocessing.context import BaseContext -from queue import Queue -from threading import Thread -from typing import ( - Any, - Callable, - Dict, - List, - Optional, - Sequence, - Set, - Tuple, - Union, -) - -import gym -import numpy as np -from gym.spaces.dict_space import Dict as SpaceDict - -import habitat -from habitat.config import Config -from habitat.core.env import Env, Observations, RLEnv -from habitat.core.logging import logger -from habitat.core.utils import tile_images - -try: - # Use torch.multiprocessing if we can. - # We have yet to find a reason to not use it and - # you are required to use it when sending a torch.Tensor - # between processes - import torch.multiprocessing as mp -except ImportError: - import multiprocessing as mp - -STEP_COMMAND = "step" -RESET_COMMAND = "reset" -RENDER_COMMAND = "render" -CLOSE_COMMAND = "close" -OBSERVATION_SPACE_COMMAND = "observation_space" -ACTION_SPACE_COMMAND = "action_space" -CALL_COMMAND = "call" -EPISODE_COMMAND = "current_episode" -PLAN_ACT_AND_PREPROCESS = "plan_act_and_preprocess" -COUNT_EPISODES_COMMAND = "count_episodes" -EPISODE_OVER = "episode_over" -GET_METRICS = "get_metrics" - - -def _make_env_fn( - config: Config, dataset: Optional[habitat.Dataset] = None, rank: int = 0 -) -> Env: - """Constructor for default habitat `env.Env`. - - :param config: configuration for environment. - :param dataset: dataset for environment. 
- :param rank: rank for setting seed of environment - :return: `env.Env` / `env.RLEnv` object - """ - habitat_env = Env(config=config, dataset=dataset) - habitat_env.seed(config.SEED + rank) - return habitat_env - - -class VectorEnv: - r"""Vectorized environment which creates multiple processes where each - process runs its own environment. Main class for parallelization of - training and evaluation. - - - All the environments are synchronized on step and reset methods. - """ - - observation_spaces: List[SpaceDict] - action_spaces: List[SpaceDict] - _workers: List[Union[mp.Process, Thread]] - _is_waiting: bool - _num_envs: int - _auto_reset_done: bool - _mp_ctx: BaseContext - _connection_read_fns: List[Callable[[], Any]] - _connection_write_fns: List[Callable[[Any], None]] - - def __init__( - self, - make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn, - env_fn_args: Sequence[Tuple] = None, - auto_reset_done: bool = True, - multiprocessing_start_method: str = "forkserver", - ) -> None: - """.. - - :param make_env_fn: function which creates a single environment. An - environment can be of type `env.Env` or `env.RLEnv` - :param env_fn_args: tuple of tuple of args to pass to the - `_make_env_fn`. - :param auto_reset_done: automatically reset the environment when - done. This functionality is provided for seamless training - of vectorized environments. - :param multiprocessing_start_method: the multiprocessing method used to - spawn worker processes. Valid methods are - :py:`{'spawn', 'forkserver', 'fork'}`; :py:`'forkserver'` is the - recommended method as it works well with CUDA. If :py:`'fork'` is - used, the subproccess must be started before any other GPU useage. - """ - self._is_waiting = False - self._is_closed = True - - assert ( - env_fn_args is not None and len(env_fn_args) > 0 - ), "number of environments to be created should be greater than 0" - - self._num_envs = len(env_fn_args) - - assert multiprocessing_start_method in self._valid_start_methods, ( - "multiprocessing_start_method must be one of {}. Got '{}'" - ).format(self._valid_start_methods, multiprocessing_start_method) - self._auto_reset_done = auto_reset_done - self._mp_ctx = mp.get_context(multiprocessing_start_method) - self._workers = [] - ( - self._connection_read_fns, - self._connection_write_fns, - ) = self._spawn_workers( # noqa - env_fn_args, make_env_fn - ) - - self._is_closed = False - - for write_fn in self._connection_write_fns: - write_fn((OBSERVATION_SPACE_COMMAND, None)) - self.observation_spaces = [ - read_fn() for read_fn in self._connection_read_fns - ] - for write_fn in self._connection_write_fns: - write_fn((ACTION_SPACE_COMMAND, None)) - self.action_spaces = [ - read_fn() for read_fn in self._connection_read_fns - ] - self.observation_space = self.observation_spaces[0] - self.action_space = self.action_spaces[0] - self._paused = [] - - @property - def num_envs(self): - r"""number of individual environments. - """ - return self._num_envs - len(self._paused) - - @staticmethod - def _worker_env( - connection_read_fn: Callable, - connection_write_fn: Callable, - env_fn: Callable, - env_fn_args: Tuple[Any], - auto_reset_done: bool, - child_pipe: Optional[Connection] = None, - parent_pipe: Optional[Connection] = None, - ) -> None: - r"""process worker for creating and interacting with the environment. 
- """ - env = env_fn(*env_fn_args) - if parent_pipe is not None: - parent_pipe.close() - try: - command, data = connection_read_fn() - while command != CLOSE_COMMAND: - if command == STEP_COMMAND: - # different step methods for habitat.RLEnv and habitat.Env - if isinstance(env, habitat.RLEnv) or isinstance( - env, gym.Env - ): - # habitat.RLEnv - observations, reward, done, info = env.step(**data) - if auto_reset_done and done: - observations, info = env.reset() - connection_write_fn((observations, reward, done, info)) - elif isinstance(env, habitat.Env): - # habitat.Env - observations = env.step(**data) - if auto_reset_done and env.episode_over: - observations = env.reset() - connection_write_fn(observations) - else: - raise NotImplementedError - - elif command == RESET_COMMAND: - observations = env.reset() - connection_write_fn(observations) - - elif command == RENDER_COMMAND: - connection_write_fn(env.render(*data[0], **data[1])) - - elif ( - command == OBSERVATION_SPACE_COMMAND - or command == ACTION_SPACE_COMMAND - ): - if isinstance(command, str): - connection_write_fn(getattr(env, command)) - - elif command == CALL_COMMAND: - function_name, function_args = data - if function_args is None or len(function_args) == 0: - result = getattr(env, function_name)() - else: - result = getattr(env, function_name)(**function_args) - connection_write_fn(result) - - # TODO: update CALL_COMMAND for getting attribute like this - elif command == EPISODE_COMMAND: - connection_write_fn(env.current_episode) - - elif command == PLAN_ACT_AND_PREPROCESS: - observations, reward, done, info = \ - env.plan_act_and_preprocess(data) - if auto_reset_done and done: - observations, info = env.reset() - connection_write_fn((observations, reward, done, info)) - - elif command == COUNT_EPISODES_COMMAND: - connection_write_fn(len(env.episodes)) - - elif command == EPISODE_OVER: - connection_write_fn(env.episode_over) - - elif command == GET_METRICS: - result = env.get_metrics() - connection_write_fn(result) - - else: - raise NotImplementedError - - command, data = connection_read_fn() - - if child_pipe is not None: - child_pipe.close() - except KeyboardInterrupt: - logger.info("Worker KeyboardInterrupt") - finally: - env.close() - - def _spawn_workers( - self, - env_fn_args: Sequence[Tuple], - make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn, - ) -> Tuple[List[Callable[[], Any]], List[Callable[[Any], None]]]: - parent_connections, worker_connections = zip( - *[self._mp_ctx.Pipe(duplex=True) for _ in range(self._num_envs)] - ) - self._workers = [] - for worker_conn, parent_conn, env_args in zip( - worker_connections, parent_connections, env_fn_args - ): - ps = self._mp_ctx.Process( - target=self._worker_env, - args=( - worker_conn.recv, - worker_conn.send, - make_env_fn, - env_args, - self._auto_reset_done, - worker_conn, - parent_conn, - ), - ) - self._workers.append(ps) - ps.daemon = True - ps.start() - worker_conn.close() - return ( - [p.recv for p in parent_connections], - [p.send for p in parent_connections], - ) - - def current_episodes(self): - self._is_waiting = True - for write_fn in self._connection_write_fns: - write_fn((EPISODE_COMMAND, None)) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - self._is_waiting = False - return results - - def count_episodes(self): - self._is_waiting = True - for write_fn in self._connection_write_fns: - write_fn((COUNT_EPISODES_COMMAND, None)) - results = [] - for read_fn in self._connection_read_fns: - 
results.append(read_fn()) - self._is_waiting = False - return results - - def episode_over(self): - self._is_waiting = True - for write_fn in self._connection_write_fns: - write_fn((EPISODE_OVER, None)) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - self._is_waiting = False - return results - - def get_metrics(self): - self._is_waiting = True - for write_fn in self._connection_write_fns: - write_fn((GET_METRICS, None)) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - self._is_waiting = False - return results - - def reset(self): - r"""Reset all the vectorized environments - - :return: list of outputs from the reset method of envs. - """ - self._is_waiting = True - for write_fn in self._connection_write_fns: - write_fn((RESET_COMMAND, None)) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - obs, infos = zip(*results) - - self._is_waiting = False - return np.stack(obs), infos - - def reset_at(self, index_env: int): - r"""Reset in the index_env environment in the vector. - - :param index_env: index of the environment to be reset - :return: list containing the output of reset method of indexed env. - """ - self._is_waiting = True - self._connection_write_fns[index_env]((RESET_COMMAND, None)) - results = [self._connection_read_fns[index_env]()] - self._is_waiting = False - return results - - def step_at(self, index_env: int, action: Dict[str, Any]): - r"""Step in the index_env environment in the vector. - - :param index_env: index of the environment to be stepped into - :param action: action to be taken - :return: list containing the output of step method of indexed env. - """ - self._is_waiting = True - self._connection_write_fns[index_env]((STEP_COMMAND, action)) - results = [self._connection_read_fns[index_env]()] - self._is_waiting = False - return results - - def step_async(self, data: List[Union[int, str, Dict[str, Any]]]) -> None: - r"""Asynchronously step in the environments. - - :param data: list of size _num_envs containing keyword arguments to - pass to `step` method for each Environment. For example, - :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`. - """ - # Backward compatibility - if isinstance(data[0], (int, np.integer, str)): - data = [{"action": {"action": action}} for action in data] - - self._is_waiting = True - for write_fn, args in zip(self._connection_write_fns, data): - write_fn((STEP_COMMAND, args)) - - def step_wait(self) -> List[Observations]: - r"""Wait until all the asynchronized environments have synchronized. - """ - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - self._is_waiting = False - obs, rews, dones, infos = zip(*results) - return np.stack(obs), np.stack(rews), np.stack(dones), infos - - def step(self, data: List[Union[int, str, Dict[str, Any]]]) -> List[Any]: - r"""Perform actions in the vectorized environments. - - :param data: list of size _num_envs containing keyword arguments to - pass to `step` method for each Environment. For example, - :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`. - :return: list of outputs from the step method of envs. 
- """ - self.step_async(data) - return self.step_wait() - - def close(self) -> None: - if self._is_closed: - return - - if self._is_waiting: - for read_fn in self._connection_read_fns: - read_fn() - - for write_fn in self._connection_write_fns: - write_fn((CLOSE_COMMAND, None)) - - for _, _, write_fn, _ in self._paused: - write_fn((CLOSE_COMMAND, None)) - - for process in self._workers: - process.join() - - for _, _, _, process in self._paused: - process.join() - - self._is_closed = True - - def pause_at(self, index: int) -> None: - r"""Pauses computation on this env without destroying the env. - - :param index: which env to pause. All indexes after this one will be - shifted down by one. - - This is useful for not needing to call steps on all environments when - only some are active (for example during the last episodes of running - eval episodes). - """ - if self._is_waiting: - for read_fn in self._connection_read_fns: - read_fn() - read_fn = self._connection_read_fns.pop(index) - write_fn = self._connection_write_fns.pop(index) - worker = self._workers.pop(index) - self._paused.append((index, read_fn, write_fn, worker)) - - def resume_all(self) -> None: - r"""Resumes any paused envs. - """ - for index, read_fn, write_fn, worker in reversed(self._paused): - self._connection_read_fns.insert(index, read_fn) - self._connection_write_fns.insert(index, write_fn) - self._workers.insert(index, worker) - self._paused = [] - - def call_at( - self, - index: int, - function_name: str, - function_args: Optional[Dict[str, Any]] = None, - ) -> Any: - r"""Calls a function (which is passed by name) on the selected env and - returns the result. - - :param index: which env to call the function on. - :param function_name: the name of the function to call on the env. - :param function_args: optional function args. - :return: result of calling the function. - """ - self._is_waiting = True - self._connection_write_fns[index]( - (CALL_COMMAND, (function_name, function_args)) - ) - result = self._connection_read_fns[index]() - self._is_waiting = False - return result - - def call( - self, - function_names: List[str], - function_args_list: Optional[List[Any]] = None, - ) -> List[Any]: - r"""Calls a list of functions (which are passed by name) on the - corresponding env (by index). - - :param function_names: the name of the functions to call on the envs. - :param function_args_list: list of function args for each function. If - provided, :py:`len(function_args_list)` should be as long as - :py:`len(function_names)`. - :return: result of calling the function. - """ - self._is_waiting = True - if function_args_list is None: - function_args_list = [None] * len(function_names) - assert len(function_names) == len(function_args_list) - func_args = zip(function_names, function_args_list) - for write_fn, func_args_on in zip( - self._connection_write_fns, func_args - ): - write_fn((CALL_COMMAND, func_args_on)) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - self._is_waiting = False - return results - - def render( - self, mode: str = "human", *args, **kwargs - ) -> Union[np.ndarray, None]: - r"""Render observations from all environments in a tiled image. 
- """ - for write_fn in self._connection_write_fns: - write_fn((RENDER_COMMAND, (args, {"mode": "rgb", **kwargs}))) - images = [read_fn() for read_fn in self._connection_read_fns] - tile = tile_images(images) - if mode == "human": - from habitat.core.utils import try_cv2_import - - cv2 = try_cv2_import() - - cv2.imshow("vecenv", tile[:, :, ::-1]) - cv2.waitKey(1) - return None - elif mode == "rgb_array": - return tile - else: - raise NotImplementedError - - def plan_act_and_preprocess(self, inputs): - self._assert_not_closed() - self._is_waiting = True - for e, write_fn in enumerate(self._connection_write_fns): - write_fn((PLAN_ACT_AND_PREPROCESS, inputs[e])) - results = [] - for read_fn in self._connection_read_fns: - results.append(read_fn()) - obs, rews, dones, infos = zip(*results) - self._is_waiting = False - return np.stack(obs), np.stack(rews), np.stack(dones), infos - - def _assert_not_closed(self): - assert not self._is_closed, "Trying to operate on a SubprocVecEnv after calling close()" - - @property - def _valid_start_methods(self) -> Set[str]: - return {"forkserver", "spawn", "fork"} - - def __del__(self): - self.close() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - -class ThreadedVectorEnv(VectorEnv): - r"""Provides same functionality as `VectorEnv`, the only difference is it - runs in a multi-thread setup inside a single process. - - `VectorEnv` runs in a multi-proc setup. This makes it much easier to debug - when using `VectorEnv` because you can actually put break points in the - environment methods. It should not be used for best performance. - """ - - def _spawn_workers( - self, - env_fn_args: Sequence[Tuple], - make_env_fn: Callable[..., Env] = _make_env_fn, - ) -> Tuple[List[Callable[[], Any]], List[Callable[[Any], None]]]: - parent_read_queues, parent_write_queues = zip( - *[(Queue(), Queue()) for _ in range(self._num_envs)] - ) - self._workers = [] - for parent_read_queue, parent_write_queue, env_args in zip( - parent_read_queues, parent_write_queues, env_fn_args - ): - thread = Thread( - target=self._worker_env, - args=( - parent_write_queue.get, - parent_read_queue.put, - make_env_fn, - env_args, - self._auto_reset_done, - ), - ) - self._workers.append(thread) - thread.daemon = True - thread.start() - return ( - [q.get for q in parent_read_queues], - [q.put for q in parent_write_queues], - ) diff --git a/main.py b/main.py deleted file mode 100755 index 437c8ad..0000000 --- a/main.py +++ /dev/null @@ -1,695 +0,0 @@ -from collections import deque, defaultdict -import os -import logging -import time -import json -import gym -import torch.nn as nn -import torch -import numpy as np - -from model import RL_Policy, Semantic_Mapping -from utils.storage import GlobalRolloutStorage -from envs import make_vec_envs -from arguments import get_args -import algo - -os.environ["OMP_NUM_THREADS"] = "1" - - -def main(): - args = get_args() - - np.random.seed(args.seed) - torch.manual_seed(args.seed) - - if args.cuda: - torch.cuda.manual_seed(args.seed) - - # Setup Logging - log_dir = "{}/models/{}/".format(args.dump_location, args.exp_name) - dump_dir = "{}/dump/{}/".format(args.dump_location, args.exp_name) - - if not os.path.exists(log_dir): - os.makedirs(log_dir) - if not os.path.exists(dump_dir): - os.makedirs(dump_dir) - - logging.basicConfig( - filename=log_dir + 'train.log', - level=logging.INFO) - print("Dumping at {}".format(log_dir)) - print(args) - logging.info(args) - - # Logging and loss variables - 
num_scenes = args.num_processes - num_episodes = int(args.num_eval_episodes) - device = args.device = torch.device("cuda:0" if args.cuda else "cpu") - - g_masks = torch.ones(num_scenes).float().to(device) - - best_g_reward = -np.inf - - if args.eval: - episode_success = [] - episode_spl = [] - episode_dist = [] - for _ in range(args.num_processes): - episode_success.append(deque(maxlen=num_episodes)) - episode_spl.append(deque(maxlen=num_episodes)) - episode_dist.append(deque(maxlen=num_episodes)) - - else: - episode_success = deque(maxlen=1000) - episode_spl = deque(maxlen=1000) - episode_dist = deque(maxlen=1000) - - finished = np.zeros((args.num_processes)) - wait_env = np.zeros((args.num_processes)) - - g_episode_rewards = deque(maxlen=1000) - - g_value_losses = deque(maxlen=1000) - g_action_losses = deque(maxlen=1000) - g_dist_entropies = deque(maxlen=1000) - - per_step_g_rewards = deque(maxlen=1000) - - g_process_rewards = np.zeros((num_scenes)) - - # Starting environments - torch.set_num_threads(1) - envs = make_vec_envs(args) - obs, infos = envs.reset() - - torch.set_grad_enabled(False) - - # Initialize map variables: - # Full map consists of multiple channels containing the following: - # 1. Obstacle Map - # 2. Exploread Area - # 3. Current Agent Location - # 4. Past Agent Locations - # 5,6,7,.. : Semantic Categories - nc = args.num_sem_categories + 4 # num channels - - # Calculating full and local map sizes - map_size = args.map_size_cm // args.map_resolution - full_w, full_h = map_size, map_size - local_w = int(full_w / args.global_downscaling) - local_h = int(full_h / args.global_downscaling) - - # Initializing full and local map - full_map = torch.zeros(num_scenes, nc, full_w, full_h).float().to(device) - local_map = torch.zeros(num_scenes, nc, local_w, - local_h).float().to(device) - - # Initial full and local pose - full_pose = torch.zeros(num_scenes, 3).float().to(device) - local_pose = torch.zeros(num_scenes, 3).float().to(device) - - # Origin of local map - origins = np.zeros((num_scenes, 3)) - - # Local Map Boundaries - lmb = np.zeros((num_scenes, 4)).astype(int) - - # Planner pose inputs has 7 dimensions - # 1-3 store continuous global agent location - # 4-7 store local map boundaries - planner_pose_inputs = np.zeros((num_scenes, 7)) - - def get_local_map_boundaries(agent_loc, local_sizes, full_sizes): - loc_r, loc_c = agent_loc - local_w, local_h = local_sizes - full_w, full_h = full_sizes - - if args.global_downscaling > 1: - gx1, gy1 = loc_r - local_w // 2, loc_c - local_h // 2 - gx2, gy2 = gx1 + local_w, gy1 + local_h - if gx1 < 0: - gx1, gx2 = 0, local_w - if gx2 > full_w: - gx1, gx2 = full_w - local_w, full_w - - if gy1 < 0: - gy1, gy2 = 0, local_h - if gy2 > full_h: - gy1, gy2 = full_h - local_h, full_h - else: - gx1, gx2, gy1, gy2 = 0, full_w, 0, full_h - - return [gx1, gx2, gy1, gy2] - - def init_map_and_pose(): - full_map.fill_(0.) - full_pose.fill_(0.) - full_pose[:, :2] = args.map_size_cm / 100.0 / 2.0 - - locs = full_pose.cpu().numpy() - planner_pose_inputs[:, :3] = locs - for e in range(num_scenes): - r, c = locs[e, 1], locs[e, 0] - loc_r, loc_c = [int(r * 100.0 / args.map_resolution), - int(c * 100.0 / args.map_resolution)] - - full_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 2] = 1.0 - - lmb[e] = get_local_map_boundaries((loc_r, loc_c), - (local_w, local_h), - (full_w, full_h)) - - planner_pose_inputs[e, 3:] = lmb[e] - origins[e] = [lmb[e][2] * args.map_resolution / 100.0, - lmb[e][0] * args.map_resolution / 100.0, 0.] 
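# An illustrative sketch of the window bookkeeping above, assuming the usual
# defaults map_size_cm = 2400, map_resolution = 5 (cm per cell) and
# global_downscaling = 2 (assumed values, only for illustration):
#   full map  : 2400 / 5 = 480 x 480 cells
#   local map : 480 / 2  = 240 x 240 cells
#   an agent starting at the map centre (cell 240, 240) gets
#   lmb[e] = [120, 360, 120, 360]  and
#   origins[e] = [120 * 5 / 100.0, 120 * 5 / 100.0, 0.] = [6.0, 6.0, 0.],
#   so local poses are simply global poses shifted by the window's lower corner.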
- - for e in range(num_scenes): - local_map[e] = full_map[e, :, - lmb[e, 0]:lmb[e, 1], - lmb[e, 2]:lmb[e, 3]] - local_pose[e] = full_pose[e] - \ - torch.from_numpy(origins[e]).to(device).float() - - def init_map_and_pose_for_env(e): - full_map[e].fill_(0.) - full_pose[e].fill_(0.) - full_pose[e, :2] = args.map_size_cm / 100.0 / 2.0 - - locs = full_pose[e].cpu().numpy() - planner_pose_inputs[e, :3] = locs - r, c = locs[1], locs[0] - loc_r, loc_c = [int(r * 100.0 / args.map_resolution), - int(c * 100.0 / args.map_resolution)] - - full_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 2] = 1.0 - - lmb[e] = get_local_map_boundaries((loc_r, loc_c), - (local_w, local_h), - (full_w, full_h)) - - planner_pose_inputs[e, 3:] = lmb[e] - origins[e] = [lmb[e][2] * args.map_resolution / 100.0, - lmb[e][0] * args.map_resolution / 100.0, 0.] - - local_map[e] = full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]] - local_pose[e] = full_pose[e] - \ - torch.from_numpy(origins[e]).to(device).float() - - def update_intrinsic_rew(e): - prev_explored_area = full_map[e, 1].sum(1).sum(0) - full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]] = \ - local_map[e] - curr_explored_area = full_map[e, 1].sum(1).sum(0) - intrinsic_rews[e] = curr_explored_area - prev_explored_area - intrinsic_rews[e] *= (args.map_resolution / 100.)**2 # to m^2 - - init_map_and_pose() - - # Global policy observation space - ngc = 8 + args.num_sem_categories - es = 2 - g_observation_space = gym.spaces.Box(0, 1, - (ngc, - local_w, - local_h), dtype='uint8') - - # Global policy action space - g_action_space = gym.spaces.Box(low=0.0, high=0.99, - shape=(2,), dtype=np.float32) - - # Global policy recurrent layer size - g_hidden_size = args.global_hidden_size - - # Semantic Mapping - sem_map_module = Semantic_Mapping(args).to(device) - sem_map_module.eval() - - # Global policy - g_policy = RL_Policy(g_observation_space.shape, g_action_space, - model_type=1, - base_kwargs={'recurrent': args.use_recurrent_global, - 'hidden_size': g_hidden_size, - 'num_sem_categories': ngc - 8 - }).to(device) - g_agent = algo.PPO(g_policy, args.clip_param, args.ppo_epoch, - args.num_mini_batch, args.value_loss_coef, - args.entropy_coef, lr=args.lr, eps=args.eps, - max_grad_norm=args.max_grad_norm) - - global_input = torch.zeros(num_scenes, ngc, local_w, local_h) - global_orientation = torch.zeros(num_scenes, 1).long() - intrinsic_rews = torch.zeros(num_scenes).to(device) - extras = torch.zeros(num_scenes, 2) - - # Storage - g_rollouts = GlobalRolloutStorage(args.num_global_steps, - num_scenes, g_observation_space.shape, - g_action_space, g_policy.rec_state_size, - es).to(device) - - if args.load != "0": - print("Loading model {}".format(args.load)) - state_dict = torch.load(args.load, - map_location=lambda storage, loc: storage) - g_policy.load_state_dict(state_dict) - - if args.eval: - g_policy.eval() - - # Predict semantic map from frame 1 - poses = torch.from_numpy(np.asarray( - [infos[env_idx]['sensor_pose'] for env_idx in range(num_scenes)]) - ).float().to(device) - - _, local_map, _, local_pose = \ - sem_map_module(obs, poses, local_map, local_pose) - - # Compute Global policy input - locs = local_pose.cpu().numpy() - global_input = torch.zeros(num_scenes, ngc, local_w, local_h) - global_orientation = torch.zeros(num_scenes, 1).long() - - for e in range(num_scenes): - r, c = locs[e, 1], locs[e, 0] - loc_r, loc_c = [int(r * 100.0 / args.map_resolution), - int(c * 100.0 / args.map_resolution)] - - local_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 
2] = 1. - global_orientation[e] = int((locs[e, 2] + 180.0) / 5.) - - global_input[:, 0:4, :, :] = local_map[:, 0:4, :, :].detach() - global_input[:, 4:8, :, :] = nn.MaxPool2d(args.global_downscaling)( - full_map[:, 0:4, :, :]) - global_input[:, 8:, :, :] = local_map[:, 4:, :, :].detach() - goal_cat_id = torch.from_numpy(np.asarray( - [infos[env_idx]['goal_cat_id'] for env_idx - in range(num_scenes)])) - - extras = torch.zeros(num_scenes, 2) - extras[:, 0] = global_orientation[:, 0] - extras[:, 1] = goal_cat_id - - g_rollouts.obs[0].copy_(global_input) - g_rollouts.extras[0].copy_(extras) - - # Run Global Policy (global_goals = Long-Term Goal) - g_value, g_action, g_action_log_prob, g_rec_states = \ - g_policy.act( - g_rollouts.obs[0], - g_rollouts.rec_states[0], - g_rollouts.masks[0], - extras=g_rollouts.extras[0], - deterministic=False - ) - - cpu_actions = nn.Sigmoid()(g_action).cpu().numpy() - global_goals = [[int(action[0] * local_w), int(action[1] * local_h)] - for action in cpu_actions] - global_goals = [[min(x, int(local_w - 1)), min(y, int(local_h - 1))] - for x, y in global_goals] - - goal_maps = [np.zeros((local_w, local_h)) for _ in range(num_scenes)] - - for e in range(num_scenes): - goal_maps[e][global_goals[e][0], global_goals[e][1]] = 1 - - planner_inputs = [{} for e in range(num_scenes)] - for e, p_input in enumerate(planner_inputs): - p_input['map_pred'] = local_map[e, 0, :, :].cpu().numpy() - p_input['exp_pred'] = local_map[e, 1, :, :].cpu().numpy() - p_input['pose_pred'] = planner_pose_inputs[e] - p_input['goal'] = goal_maps[e] # global_goals[e] - p_input['new_goal'] = 1 - p_input['found_goal'] = 0 - p_input['wait'] = wait_env[e] or finished[e] - if args.visualize or args.print_images: - local_map[e, -1, :, :] = 1e-5 - p_input['sem_map_pred'] = local_map[e, 4:, :, : - ].argmax(0).cpu().numpy() - - obs, _, done, infos = envs.plan_act_and_preprocess(planner_inputs) - - start = time.time() - g_reward = 0 - - torch.set_grad_enabled(False) - spl_per_category = defaultdict(list) - success_per_category = defaultdict(list) - - for step in range(args.num_training_frames // args.num_processes + 1): - if finished.sum() == args.num_processes: - break - - g_step = (step // args.num_local_steps) % args.num_global_steps - l_step = step % args.num_local_steps - - # ------------------------------------------------------------------ - # Reinitialize variables when episode ends - l_masks = torch.FloatTensor([0 if x else 1 - for x in done]).to(device) - g_masks *= l_masks - - for e, x in enumerate(done): - if x: - spl = infos[e]['spl'] - success = infos[e]['success'] - dist = infos[e]['distance_to_goal'] - spl_per_category[infos[e]['goal_name']].append(spl) - success_per_category[infos[e]['goal_name']].append(success) - if args.eval: - episode_success[e].append(success) - episode_spl[e].append(spl) - episode_dist[e].append(dist) - if len(episode_success[e]) == num_episodes: - finished[e] = 1 - else: - episode_success.append(success) - episode_spl.append(spl) - episode_dist.append(dist) - wait_env[e] = 1. 
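# update_intrinsic_rew(e), called next, credits how much the explored-area
# channel (channel 1) of the full map grew, converted to m^2.  A rough sketch,
# assuming the 5 cm map_resolution default:
#   one cell covers (5 / 100.) ** 2 = 0.0025 m^2,
#   so 400 newly explored cells give intrinsic_rews[e] = 1.0 m^2,
#   which is later added to g_reward after scaling by args.intrinsic_rew_coeff.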
- update_intrinsic_rew(e) - init_map_and_pose_for_env(e) - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Semantic Mapping Module - poses = torch.from_numpy(np.asarray( - [infos[env_idx]['sensor_pose'] for env_idx - in range(num_scenes)]) - ).float().to(device) - - _, local_map, _, local_pose = \ - sem_map_module(obs, poses, local_map, local_pose) - - locs = local_pose.cpu().numpy() - planner_pose_inputs[:, :3] = locs + origins - local_map[:, 2, :, :].fill_(0.) # Resetting current location channel - for e in range(num_scenes): - r, c = locs[e, 1], locs[e, 0] - loc_r, loc_c = [int(r * 100.0 / args.map_resolution), - int(c * 100.0 / args.map_resolution)] - local_map[e, 2:4, loc_r - 2:loc_r + 3, loc_c - 2:loc_c + 3] = 1. - - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Global Policy - if l_step == args.num_local_steps - 1: - # For every global step, update the full and local maps - for e in range(num_scenes): - if wait_env[e] == 1: # New episode - wait_env[e] = 0. - else: - update_intrinsic_rew(e) - - full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]] = \ - local_map[e] - full_pose[e] = local_pose[e] + \ - torch.from_numpy(origins[e]).to(device).float() - - locs = full_pose[e].cpu().numpy() - r, c = locs[1], locs[0] - loc_r, loc_c = [int(r * 100.0 / args.map_resolution), - int(c * 100.0 / args.map_resolution)] - - lmb[e] = get_local_map_boundaries((loc_r, loc_c), - (local_w, local_h), - (full_w, full_h)) - - planner_pose_inputs[e, 3:] = lmb[e] - origins[e] = [lmb[e][2] * args.map_resolution / 100.0, - lmb[e][0] * args.map_resolution / 100.0, 0.] - - local_map[e] = full_map[e, :, - lmb[e, 0]:lmb[e, 1], - lmb[e, 2]:lmb[e, 3]] - local_pose[e] = full_pose[e] - \ - torch.from_numpy(origins[e]).to(device).float() - - locs = local_pose.cpu().numpy() - for e in range(num_scenes): - global_orientation[e] = int((locs[e, 2] + 180.0) / 5.) 
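# The heading locs[e, 2] is in degrees, kept in roughly [-180, 180) by the
# pose update; shifting by 180 and bucketing into 5-degree bins yields one of
# 72 classes, matching the nn.Embedding(72, 8) orientation embedding of the
# global policy.  For example: int((-90.0 + 180.0) / 5.) = 18.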
- global_input[:, 0:4, :, :] = local_map[:, 0:4, :, :] - global_input[:, 4:8, :, :] = \ - nn.MaxPool2d(args.global_downscaling)( - full_map[:, 0:4, :, :]) - global_input[:, 8:, :, :] = local_map[:, 4:, :, :].detach() - goal_cat_id = torch.from_numpy(np.asarray( - [infos[env_idx]['goal_cat_id'] for env_idx - in range(num_scenes)])) - extras[:, 0] = global_orientation[:, 0] - extras[:, 1] = goal_cat_id - - # Get exploration reward and metrics - g_reward = torch.from_numpy(np.asarray( - [infos[env_idx]['g_reward'] for env_idx in range(num_scenes)]) - ).float().to(device) - g_reward += args.intrinsic_rew_coeff * intrinsic_rews.detach() - - g_process_rewards += g_reward.cpu().numpy() - g_total_rewards = g_process_rewards * \ - (1 - g_masks.cpu().numpy()) - g_process_rewards *= g_masks.cpu().numpy() - per_step_g_rewards.append(np.mean(g_reward.cpu().numpy())) - - if np.sum(g_total_rewards) != 0: - for total_rew in g_total_rewards: - if total_rew != 0: - g_episode_rewards.append(total_rew) - - # Add samples to global policy storage - if step == 0: - g_rollouts.obs[0].copy_(global_input) - g_rollouts.extras[0].copy_(extras) - else: - g_rollouts.insert( - global_input, g_rec_states, - g_action, g_action_log_prob, g_value, - g_reward, g_masks, extras - ) - - # Sample long-term goal from global policy - g_value, g_action, g_action_log_prob, g_rec_states = \ - g_policy.act( - g_rollouts.obs[g_step + 1], - g_rollouts.rec_states[g_step + 1], - g_rollouts.masks[g_step + 1], - extras=g_rollouts.extras[g_step + 1], - deterministic=False - ) - cpu_actions = nn.Sigmoid()(g_action).cpu().numpy() - global_goals = [[int(action[0] * local_w), - int(action[1] * local_h)] - for action in cpu_actions] - global_goals = [[min(x, int(local_w - 1)), - min(y, int(local_h - 1))] - for x, y in global_goals] - - g_reward = 0 - g_masks = torch.ones(num_scenes).float().to(device) - - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Update long-term goal if target object is found - found_goal = [0 for _ in range(num_scenes)] - goal_maps = [np.zeros((local_w, local_h)) for _ in range(num_scenes)] - - for e in range(num_scenes): - goal_maps[e][global_goals[e][0], global_goals[e][1]] = 1 - - for e in range(num_scenes): - cn = infos[e]['goal_cat_id'] + 4 - if local_map[e, cn, :, :].sum() != 0.: - cat_semantic_map = local_map[e, cn, :, :].cpu().numpy() - cat_semantic_scores = cat_semantic_map - cat_semantic_scores[cat_semantic_scores > 0] = 1. 
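# cn = goal_cat_id + 4 because map channels 0-3 hold the obstacle, explored,
# current-location and past-location layers; semantic categories start at
# channel 4.  Once any cell of the goal category is non-zero, the thresholded
# mask replaces the sampled long-term goal and found_goal[e] is set below.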
- goal_maps[e] = cat_semantic_scores - found_goal[e] = 1 - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Take action and get next observation - planner_inputs = [{} for e in range(num_scenes)] - for e, p_input in enumerate(planner_inputs): - p_input['map_pred'] = local_map[e, 0, :, :].cpu().numpy() - p_input['exp_pred'] = local_map[e, 1, :, :].cpu().numpy() - p_input['pose_pred'] = planner_pose_inputs[e] - p_input['goal'] = goal_maps[e] # global_goals[e] - p_input['new_goal'] = l_step == args.num_local_steps - 1 - p_input['found_goal'] = found_goal[e] - p_input['wait'] = wait_env[e] or finished[e] - if args.visualize or args.print_images: - local_map[e, -1, :, :] = 1e-5 - p_input['sem_map_pred'] = local_map[e, 4:, :, - :].argmax(0).cpu().numpy() - - obs, _, done, infos = envs.plan_act_and_preprocess(planner_inputs) - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Training - torch.set_grad_enabled(True) - if g_step % args.num_global_steps == args.num_global_steps - 1 \ - and l_step == args.num_local_steps - 1: - if not args.eval: - g_next_value = g_policy.get_value( - g_rollouts.obs[-1], - g_rollouts.rec_states[-1], - g_rollouts.masks[-1], - extras=g_rollouts.extras[-1] - ).detach() - - g_rollouts.compute_returns(g_next_value, args.use_gae, - args.gamma, args.tau) - g_value_loss, g_action_loss, g_dist_entropy = \ - g_agent.update(g_rollouts) - g_value_losses.append(g_value_loss) - g_action_losses.append(g_action_loss) - g_dist_entropies.append(g_dist_entropy) - g_rollouts.after_update() - - torch.set_grad_enabled(False) - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Logging - if step % args.log_interval == 0: - end = time.time() - time_elapsed = time.gmtime(end - start) - log = " ".join([ - "Time: {0:0=2d}d".format(time_elapsed.tm_mday - 1), - "{},".format(time.strftime("%Hh %Mm %Ss", time_elapsed)), - "num timesteps {},".format(step * num_scenes), - "FPS {},".format(int(step * num_scenes / (end - start))) - ]) - - log += "\n\tRewards:" - - if len(g_episode_rewards) > 0: - log += " ".join([ - " Global step mean/med rew:", - "{:.4f}/{:.4f},".format( - np.mean(per_step_g_rewards), - np.median(per_step_g_rewards)), - " Global eps mean/med/min/max eps rew:", - "{:.3f}/{:.3f}/{:.3f}/{:.3f},".format( - np.mean(g_episode_rewards), - np.median(g_episode_rewards), - np.min(g_episode_rewards), - np.max(g_episode_rewards)) - ]) - - if args.eval: - total_success = [] - total_spl = [] - total_dist = [] - for e in range(args.num_processes): - for acc in episode_success[e]: - total_success.append(acc) - for dist in episode_dist[e]: - total_dist.append(dist) - for spl in episode_spl[e]: - total_spl.append(spl) - - if len(total_spl) > 0: - log += " ObjectNav succ/spl/dtg:" - log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format( - np.mean(total_success), - np.mean(total_spl), - np.mean(total_dist), - len(total_spl)) - else: - if len(episode_success) > 100: - log += " ObjectNav succ/spl/dtg:" - log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format( - np.mean(episode_success), - np.mean(episode_spl), - np.mean(episode_dist), - len(episode_spl)) - - log += "\n\tLosses:" - if len(g_value_losses) > 0 and not args.eval: - log += " ".join([ - " Policy Loss value/action/dist:", - "{:.3f}/{:.3f}/{:.3f},".format( - 
np.mean(g_value_losses), - np.mean(g_action_losses), - np.mean(g_dist_entropies)) - ]) - - print(log) - logging.info(log) - # ------------------------------------------------------------------ - - # ------------------------------------------------------------------ - # Save best models - if (step * num_scenes) % args.save_interval < \ - num_scenes: - if len(g_episode_rewards) >= 1000 and \ - (np.mean(g_episode_rewards) >= best_g_reward) \ - and not args.eval: - torch.save(g_policy.state_dict(), - os.path.join(log_dir, "model_best.pth")) - best_g_reward = np.mean(g_episode_rewards) - - # Save periodic models - if (step * num_scenes) % args.save_periodic < \ - num_scenes: - total_steps = step * num_scenes - if not args.eval: - torch.save(g_policy.state_dict(), - os.path.join(dump_dir, - "periodic_{}.pth".format(total_steps))) - # ------------------------------------------------------------------ - - # Print and save model performance numbers during evaluation - if args.eval: - print("Dumping eval details...") - - total_success = [] - total_spl = [] - total_dist = [] - for e in range(args.num_processes): - for acc in episode_success[e]: - total_success.append(acc) - for dist in episode_dist[e]: - total_dist.append(dist) - for spl in episode_spl[e]: - total_spl.append(spl) - - if len(total_spl) > 0: - log = "Final ObjectNav succ/spl/dtg:" - log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format( - np.mean(total_success), - np.mean(total_spl), - np.mean(total_dist), - len(total_spl)) - - print(log) - logging.info(log) - - # Save the spl per category - log = "Success | SPL per category\n" - for key in success_per_category: - log += "{}: {} | {}\n".format(key, - sum(success_per_category[key]) / - len(success_per_category[key]), - sum(spl_per_category[key]) / - len(spl_per_category[key])) - - print(log) - logging.info(log) - - with open('{}/{}_spl_per_cat_pred_thr.json'.format( - dump_dir, args.split), 'w') as f: - json.dump(spl_per_category, f) - - with open('{}/{}_success_per_cat_pred_thr.json'.format( - dump_dir, args.split), 'w') as f: - json.dump(success_per_category, f) - - -if __name__ == "__main__": - main() diff --git a/model.py b/model.py deleted file mode 100755 index c912ce0..0000000 --- a/model.py +++ /dev/null @@ -1,283 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import functional as F -import numpy as np - -from utils.distributions import Categorical, DiagGaussian -from utils.model import get_grid, ChannelPool, Flatten, NNBase -import envs.utils.depth_utils as du - - -class Goal_Oriented_Semantic_Policy(NNBase): - - def __init__(self, input_shape, recurrent=False, hidden_size=512, - num_sem_categories=16): - super(Goal_Oriented_Semantic_Policy, self).__init__( - recurrent, hidden_size, hidden_size) - - out_size = int(input_shape[1] / 16.) * int(input_shape[2] / 16.) 
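# The factor of 16 matches the four nn.MaxPool2d(2) layers in self.main below
# (2 ** 4 = 16).  A sketch assuming a 240 x 240 local map input:
#   out_size = int(240 / 16.) ** 2 = 225,
#   the flattened conv output has out_size * 32 = 7200 features, and
#   linear1 therefore takes out_size * 32 + 8 * 2 inputs (the extra 16 are the
#   8-dim orientation and goal embeddings concatenated in forward()).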
- - self.main = nn.Sequential( - nn.MaxPool2d(2), - nn.Conv2d(num_sem_categories + 8, 32, 3, stride=1, padding=1), - nn.ReLU(), - nn.MaxPool2d(2), - nn.Conv2d(32, 64, 3, stride=1, padding=1), - nn.ReLU(), - nn.MaxPool2d(2), - nn.Conv2d(64, 128, 3, stride=1, padding=1), - nn.ReLU(), - nn.MaxPool2d(2), - nn.Conv2d(128, 64, 3, stride=1, padding=1), - nn.ReLU(), - nn.Conv2d(64, 32, 3, stride=1, padding=1), - nn.ReLU(), - Flatten() - ) - - self.linear1 = nn.Linear(out_size * 32 + 8 * 2, hidden_size) - self.linear2 = nn.Linear(hidden_size, 256) - self.critic_linear = nn.Linear(256, 1) - self.orientation_emb = nn.Embedding(72, 8) - self.goal_emb = nn.Embedding(num_sem_categories, 8) - self.train() - - def forward(self, inputs, rnn_hxs, masks, extras): - x = self.main(inputs) - orientation_emb = self.orientation_emb(extras[:, 0]) - goal_emb = self.goal_emb(extras[:, 1]) - - x = torch.cat((x, orientation_emb, goal_emb), 1) - - x = nn.ReLU()(self.linear1(x)) - if self.is_recurrent: - x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) - - x = nn.ReLU()(self.linear2(x)) - - return self.critic_linear(x).squeeze(-1), x, rnn_hxs - - -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L15 -class RL_Policy(nn.Module): - - def __init__(self, obs_shape, action_space, model_type=0, - base_kwargs=None): - - super(RL_Policy, self).__init__() - if base_kwargs is None: - base_kwargs = {} - - if model_type == 1: - self.network = Goal_Oriented_Semantic_Policy( - obs_shape, **base_kwargs) - else: - raise NotImplementedError - - if action_space.__class__.__name__ == "Discrete": - num_outputs = action_space.n - self.dist = Categorical(self.network.output_size, num_outputs) - elif action_space.__class__.__name__ == "Box": - num_outputs = action_space.shape[0] - self.dist = DiagGaussian(self.network.output_size, num_outputs) - else: - raise NotImplementedError - - self.model_type = model_type - - @property - def is_recurrent(self): - return self.network.is_recurrent - - @property - def rec_state_size(self): - """Size of rnn_hx.""" - return self.network.rec_state_size - - def forward(self, inputs, rnn_hxs, masks, extras): - if extras is None: - return self.network(inputs, rnn_hxs, masks) - else: - return self.network(inputs, rnn_hxs, masks, extras) - - def act(self, inputs, rnn_hxs, masks, extras=None, deterministic=False): - - value, actor_features, rnn_hxs = self(inputs, rnn_hxs, masks, extras) - dist = self.dist(actor_features) - - if deterministic: - action = dist.mode() - else: - action = dist.sample() - - action_log_probs = dist.log_probs(action) - - return value, action, action_log_probs, rnn_hxs - - def get_value(self, inputs, rnn_hxs, masks, extras=None): - value, _, _ = self(inputs, rnn_hxs, masks, extras) - return value - - def evaluate_actions(self, inputs, rnn_hxs, masks, action, extras=None): - - value, actor_features, rnn_hxs = self(inputs, rnn_hxs, masks, extras) - dist = self.dist(actor_features) - - action_log_probs = dist.log_probs(action) - dist_entropy = dist.entropy().mean() - - return value, action_log_probs, dist_entropy, rnn_hxs - - -class Semantic_Mapping(nn.Module): - - """ - Semantic_Mapping - """ - - def __init__(self, args): - super(Semantic_Mapping, self).__init__() - - self.device = args.device - self.screen_h = args.frame_height - self.screen_w = args.frame_width - self.resolution = args.map_resolution - self.z_resolution = args.map_resolution - self.map_size_cm = args.map_size_cm // args.global_downscaling - self.n_channels = 3 - 
self.vision_range = args.vision_range - self.dropout = 0.5 - self.fov = args.hfov - self.du_scale = args.du_scale - self.cat_pred_threshold = args.cat_pred_threshold - self.exp_pred_threshold = args.exp_pred_threshold - self.map_pred_threshold = args.map_pred_threshold - self.num_sem_categories = args.num_sem_categories - - self.max_height = int(360 / self.z_resolution) - self.min_height = int(-40 / self.z_resolution) - self.agent_height = args.camera_height * 100. - self.shift_loc = [self.vision_range * - self.resolution // 2, 0, np.pi / 2.0] - self.camera_matrix = du.get_camera_matrix( - self.screen_w, self.screen_h, self.fov) - - self.pool = ChannelPool(1) - - vr = self.vision_range - - self.init_grid = torch.zeros( - args.num_processes, 1 + self.num_sem_categories, vr, vr, - self.max_height - self.min_height - ).float().to(self.device) - self.feat = torch.ones( - args.num_processes, 1 + self.num_sem_categories, - self.screen_h // self.du_scale * self.screen_w // self.du_scale - ).float().to(self.device) - - def forward(self, obs, pose_obs, maps_last, poses_last): - bs, c, h, w = obs.size() - depth = obs[:, 3, :, :] - - point_cloud_t = du.get_point_cloud_from_z_t( - depth, self.camera_matrix, self.device, scale=self.du_scale) - - agent_view_t = du.transform_camera_view_t( - point_cloud_t, self.agent_height, 0, self.device) - - agent_view_centered_t = du.transform_pose_t( - agent_view_t, self.shift_loc, self.device) - - max_h = self.max_height - min_h = self.min_height - xy_resolution = self.resolution - z_resolution = self.z_resolution - vision_range = self.vision_range - XYZ_cm_std = agent_view_centered_t.float() - XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution) - XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] - - vision_range // 2.) / vision_range * 2. - XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution - XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] - - (max_h + min_h) // 2.) / (max_h - min_h) * 2. 
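# The two rescalings above put the point cloud into grid-normalised
# coordinates (roughly [-1, 1] per axis): x/y in map cells centred on the
# vision_range window, z in height cells centred on (max_h + min_h) // 2,
# before du.splat_feat_nd below scatters self.feat into the voxel grid.
# A sketch, assuming vision_range = 100 cells:
#   a point at cell 75 maps to (75 - 100 // 2.) / 100 * 2. = 0.5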
- self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)( - obs[:, 4:, :, :] - ).view(bs, c - 4, h // self.du_scale * w // self.du_scale) - - XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2) - XYZ_cm_std = XYZ_cm_std.view(XYZ_cm_std.shape[0], - XYZ_cm_std.shape[1], - XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3]) - - voxels = du.splat_feat_nd( - self.init_grid * 0., self.feat, XYZ_cm_std).transpose(2, 3) - - min_z = int(25 / z_resolution - min_h) - max_z = int((self.agent_height + 1) / z_resolution - min_h) - - agent_height_proj = voxels[..., min_z:max_z].sum(4) - all_height_proj = voxels.sum(4) - - fp_map_pred = agent_height_proj[:, 0:1, :, :] - fp_exp_pred = all_height_proj[:, 0:1, :, :] - fp_map_pred = fp_map_pred / self.map_pred_threshold - fp_exp_pred = fp_exp_pred / self.exp_pred_threshold - fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0) - fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0) - - pose_pred = poses_last - - agent_view = torch.zeros(bs, c, - self.map_size_cm // self.resolution, - self.map_size_cm // self.resolution - ).to(self.device) - - x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2 - x2 = x1 + self.vision_range - y1 = self.map_size_cm // (self.resolution * 2) - y2 = y1 + self.vision_range - agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred - agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred - agent_view[:, 4:, y1:y2, x1:x2] = torch.clamp( - agent_height_proj[:, 1:, :, :] / self.cat_pred_threshold, - min=0.0, max=1.0) - - corrected_pose = pose_obs - - def get_new_pose_batch(pose, rel_pose_change): - - pose[:, 1] += rel_pose_change[:, 0] * \ - torch.sin(pose[:, 2] / 57.29577951308232) \ - + rel_pose_change[:, 1] * \ - torch.cos(pose[:, 2] / 57.29577951308232) - pose[:, 0] += rel_pose_change[:, 0] * \ - torch.cos(pose[:, 2] / 57.29577951308232) \ - - rel_pose_change[:, 1] * \ - torch.sin(pose[:, 2] / 57.29577951308232) - pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232 - - pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0 - pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0 - - return pose - - current_poses = get_new_pose_batch(poses_last, corrected_pose) - st_pose = current_poses.clone().detach() - - st_pose[:, :2] = - (st_pose[:, :2] - * 100.0 / self.resolution - - self.map_size_cm // (self.resolution * 2)) /\ - (self.map_size_cm // (self.resolution * 2)) - st_pose[:, 2] = 90. 
- (st_pose[:, 2]) - - rot_mat, trans_mat = get_grid(st_pose, agent_view.size(), - self.device) - - rotated = F.grid_sample(agent_view, rot_mat, align_corners=True) - translated = F.grid_sample(rotated, trans_mat, align_corners=True) - - maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) - - map_pred, _ = torch.max(maps2, 1) - - return fp_map_pred, map_pred, pose_pred, current_poses diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8c8800c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-fmm==2019.1.30 -scikit-learn==0.22.2.post1 -scikit-image==0.15.0 -numpy>=1.20.2 -ifcfg diff --git a/semantic_exploration/README.md b/semantic_exploration/README.md new file mode 100644 index 0000000..f0c4e4d --- /dev/null +++ b/semantic_exploration/README.md @@ -0,0 +1 @@ +# semantic_exploration diff --git a/semantic_exploration/agents/sem_exp.py b/semantic_exploration/agents/sem_exp.py new file mode 100644 index 0000000..5fb1ec3 --- /dev/null +++ b/semantic_exploration/agents/sem_exp.py @@ -0,0 +1,577 @@ +# -*- coding: utf-8 -*- +import math +import os + +import third_party.semantic_exploration.agents.utils.visualization as vu +import cv2 +import third_party.semantic_exploration.envs.utils.pose as pu +from third_party.semantic_exploration.envs.utils.fmm_planner import FMMPlanner +import numpy as np +import skimage.morphology +from third_party.semantic_exploration.agents.utils.detic_semantic_prediction import SemanticPredDetic +from third_party.semantic_exploration.agents.utils.owlvit_semantic_prediction import SemanticPredOwlvit +from third_party.semantic_exploration.agents.utils.semantic_prediction import SemanticPredMaskRCNN +from third_party.semantic_exploration.constants import color_palette +from PIL import Image +from torchvision import transforms + + +class Sem_Exp_Env_Agent: + """The Sem_Exp environment agent class. A seperate Sem_Exp_Env_Agent class + object is used for each environment thread. 
+ + """ + + def __init__(self, config, rank=1): + self.config = config + # initialize transform for RGB observations + self.res = transforms.Compose( + [ + transforms.ToPILImage(), + transforms.Resize( + (self.config.FRAME_HEIGHT, self.config.FRAME_WIDTH), + interpolation=Image.NEAREST, + ), + ] + ) + + if self.config.DETECTION_MODEL == "detectron2": + self.sem_pred = SemanticPredMaskRCNN(self.config) + elif self.config.DETECTION_MODEL == "detic": + self.sem_pred = SemanticPredDetic(self.config) + elif self.config.DETECTION_MODEL == "owlvit": + self.sem_pred = SemanticPredOwlvit(self.config) + else: + raise NotImplementedError + + # initializations for planning: + self.selem = skimage.morphology.disk(self.config.OBS_DILATION_SELEM_RADIUS) + self.obs = None + self.info = None + self.obs_shape = None + self.collision_map = None + self.visited = None + self.visited_vis = None + self.col_width = None + self.curr_loc = None + self.last_loc = None + self.last_action = None + self.count_forward_actions = None + + if self.config.PLANNER == "frontier": + self.start_obs_dilation_selem_radius = self.config.OBS_DILATION_SELEM_RADIUS + self.goal_dilation_selem_radius = self.config.GOAL_DILATION_SELEM_RADIUS + self.min_obs_dilation_selem_radius = ( + self.config.MIN_OBS_DILATION_SELEM_RADIUS + ) + self.agent_cell_radius = self.config.AGENT_CELL_RADIUS + self.goal_tolerance = self.config.GOAL_TOLERANCE + self.continuous_angle_tolerance = self.config.CONTINUOUS_ANGLE_TOLERANCE + self.curr_obs_dilation_selem_radius = None + self.obs_dilation_selem = None + + if self.config.VISUALIZE: + this_dir = os.path.dirname(os.path.abspath(__file__)) + semantic_exploration_dir = os.path.join(os.path.dirname(this_dir)) + self.legend = cv2.imread(semantic_exploration_dir+"/docs/legend.png") + self.vis_image = None + self.rgb_vis = None + self.depth_vis = None + self.goal_name = None + self.timestep = 0 + self.rank = rank + self.episode_no = 0 + self.cur_stg = None + + def reset(self, obs_size, goal_name): + self.info = None + self.obs_shape = obs_size + self.goal_name = goal_name + + # Episode initializations + map_shape = ( + self.config.MAP_SIZE_CM // self.config.MAP_RESOLUTION, + self.config.MAP_SIZE_CM // self.config.MAP_RESOLUTION, + ) + self.collision_map = np.zeros(map_shape) + self.visited = np.zeros(map_shape) + self.visited_vis = np.zeros(map_shape) + self.col_width = 1 + self.count_forward_actions = 0 + self.curr_loc = [ + self.config.MAP_SIZE_CM / 100.0 / 2.0, + self.config.MAP_SIZE_CM / 100.0 / 2.0, + 0.0, + ] + self.last_action = None + + if self.config.PLANNER == "frontier": + self.curr_obs_dilation_selem_radius = self.start_obs_dilation_selem_radius + self.obs_dilation_selem = skimage.morphology.disk( + self.curr_obs_dilation_selem_radius + ) + + if self.config.VISUALIZE: + self.vis_image = vu.init_vis_image(self.goal_name, self.legend) + self.timestep = 0 + + def update_vis_image_goal(self, goal_name): + self.goal_name = goal_name + if self.config.VISUALIZE: + self.vis_image = vu.init_vis_image(self.goal_name, self.legend) + + def plan_act_and_preprocess(self, planner_inputs, info): + """Function responsible for planning, taking the action and + preprocessing observations + + Args: + planner_inputs (dict): + dict with following keys: + 'map_pred' (ndarray): (M, M) map prediction + 'goal' (ndarray): (M, M) mat denoting goal locations + 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) + and planning window (gx1, gx2, gy1, gy2) + 'found_goal' (bool): whether the goal object is found + + Returns: + 
obs (ndarray): preprocessed observations ((4+C) x H x W) + reward (float): amount of reward returned after previous action + done (bool): whether the episode has ended + info (dict): contains timestep, pose, goal category and + evaluation metric info + """ + + self.info = info + # plan + if planner_inputs["wait"]: + self.last_action = None + self.info["sensor_pose"] = [0.0, 0.0, 0.0] + return np.zeros(self.obs.shape), 0.0, False, self.info + + action = self._plan(planner_inputs) + + if self.config.VISUALIZE: + self._visualize(planner_inputs) + + self.timestep += 1 + + if action >= 0: + # act + action = {"action": action} + obs = self.info["state"] + self.last_action = action["action"] + self.obs = obs + self.info = info + self.info["action"] = action + + return obs, 0.0, False, info + + else: + self.last_action = None + self.info["sensor_pose"] = [0.0, 0.0, 0.0] + self.info["action"] = -1 + return np.zeros(self.obs_shape), 0.0, False, self.info + + def _reach_goal_if_in_map(self, goal_map, found_goal): + height = goal_map.shape[0] + width = goal_map.shape[1] + init_goal_map = np.zeros((height, width)) + if found_goal: + init_goal_map = goal_map + return init_goal_map + + def _explore_otherwise(self, exp_pred, goal_map, found_goal): + """Explore closest unexplored region otherwise.""" + # Select unexplored area + frontier_map = exp_pred == 0 + self.dilate_explored_kernel = skimage.morphology.disk(10) + # Dilate explored area + frontier_map = 1 - skimage.morphology.binary_dilation( + 1 - frontier_map, self.dilate_explored_kernel + ) + + self.select_border_kernel = skimage.morphology.disk(1) + # Select the frontier + frontier_map = ( + skimage.morphology.binary_dilation(frontier_map, self.select_border_kernel) + - frontier_map + ) + + if not found_goal: + goal_map = frontier_map + + return goal_map + + def _plan(self, planner_inputs): + """Function responsible for planning + + Args: + planner_inputs (dict): + dict with following keys: + 'map_pred' (ndarray): (M, M) map prediction + 'goal' (ndarray): (M, M) goal locations + 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) + and planning window (gx1, gx2, gy1, gy2) + 'found_goal' (bool): whether the goal object is found + + Returns: + action (int): action id + """ + + self.last_loc = self.curr_loc + + # Get Map prediction (obstacle) + map_pred = np.rint(planner_inputs["map_pred"]) + if self.config.PLANNER == "frontier": + goal = self._reach_goal_if_in_map( + planner_inputs["goal"], planner_inputs["found_goal"] + ) + goal = self._explore_otherwise( + planner_inputs["exp_pred"], goal, planner_inputs["found_goal"] + ) + else: + goal = planner_inputs["goal"] + + # Get pose prediction and global policy planning window + start_x, start_y, start_o, gx1, gx2, gy1, gy2 = planner_inputs["pose_pred"] + gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) + planning_window = [gx1, gx2, gy1, gy2] + + # Get curr loc + self.curr_loc = [start_x, start_y, start_o] + r, c = start_y, start_x + start = [ + int(r * 100.0 / self.config.MAP_RESOLUTION - gx1), + int(c * 100.0 / self.config.MAP_RESOLUTION - gy1), + ] + start = pu.threshold_poses(start, map_pred.shape) + + self.visited[gx1:gx2, gy1:gy2][ + start[0] - 0 : start[0] + 1, start[1] - 0 : start[1] + 1 + ] = 1 + + if self.config.VISUALIZE: + # Get last loc + last_start_x, last_start_y = self.last_loc[0], self.last_loc[1] + r, c = last_start_y, last_start_x + last_start = [ + int(r * 100.0 / self.config.MAP_RESOLUTION - gx1), + int(c * 100.0 / self.config.MAP_RESOLUTION - gy1), + ] + last_start = 
pu.threshold_poses(last_start, map_pred.shape) + self.visited_vis[gx1:gx2, gy1:gy2] = vu.draw_line( + last_start, start, self.visited_vis[gx1:gx2, gy1:gy2] + ) + + # Collision check + if self.last_action == 1: + x1, y1, t1 = self.last_loc + x2, y2, _ = self.curr_loc + buf = 4 + length = 2 + + if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05: + self.col_width += 2 + if self.col_width == 7: + length = 4 + buf = 3 + self.col_width = min(self.col_width, 5) + else: + self.col_width = 1 + + dist = pu.get_l2_distance(x1, x2, y1, y2) + if dist < self.config.COLLISION_THRESHOLD: # Collision + width = self.col_width + for i in range(length): + for j in range(width): + wx = x1 + 0.05 * ( + (i + buf) * np.cos(np.deg2rad(t1)) + + (j - width // 2) * np.sin(np.deg2rad(t1)) + ) + wy = y1 + 0.05 * ( + (i + buf) * np.sin(np.deg2rad(t1)) + - (j - width // 2) * np.cos(np.deg2rad(t1)) + ) + r, c = wy, wx + r, c = int(r * 100 / self.config.MAP_RESOLUTION), int( + c * 100 / self.config.MAP_RESOLUTION + ) + [r, c] = pu.threshold_poses([r, c], self.collision_map.shape) + self.collision_map[r, c] = 1 + + stg, replan, stop = self._get_stg( + map_pred, start, np.copy(goal), planning_window + ) + + # We were not able to find a path to the high-level goal + if replan and self.config.PLANNER == "frontier": + # Clean collision map + self.collision_map *= 0 + + # Reduce obstacle dilation + if self.curr_obs_dilation_selem_radius > 1: + self.curr_obs_dilation_selem_radius -= 1 + self.obs_dilation_selem = skimage.morphology.disk( + self.curr_obs_dilation_selem_radius + ) + + # Deterministic Local Policy + if stop and planner_inputs["found_goal"] == 1: + if self._get_distance_to_obstacle() <= 0.2: + action = 0 + else: + action = 1 + else: + (stg_x, stg_y) = stg + angle_st_goal = math.degrees(math.atan2(stg_x - start[0], stg_y - start[1])) + angle_agent = (start_o) % 360.0 + if angle_agent > 180: + angle_agent -= 360 + + relative_angle = (angle_agent - angle_st_goal) % 360.0 + if relative_angle > 180: + relative_angle -= 360 + + if relative_angle > self.config.TURN_ANGLE / 2.0: + # Right + action = 3 + elif relative_angle < -self.config.TURN_ANGLE / 2.0: + # Left + action = 2 + else: + # Forward + action = 1 + + self.cur_stg = stg + + return action + + def _get_stg(self, grid, start, goal, planning_window): + """Get short-term goal""" + + [gx1, gx2, gy1, gy2] = planning_window + + x1, y1, = ( + 0, + 0, + ) + x2, y2 = grid.shape + + def add_boundary(mat, value=1): + h, w = mat.shape + new_mat = np.zeros((h + 2, w + 2)) + value + new_mat[1 : h + 1, 1 : w + 1] = mat + return new_mat + + if self.config.PLANNER == "frontier": + obstacles = grid[x1:x2, y1:y2] + # Dilate obstacles + dilated_obstacles = cv2.dilate( + obstacles, self.obs_dilation_selem, iterations=1 + ) + traversible = 1 - dilated_obstacles + else: + traversible = ( + skimage.morphology.binary_dilation(grid[x1:x2, y1:y2], self.selem) + != True # noqa + ) + traversible[self.collision_map[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 0 + traversible[self.visited[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 1 + + traversible[ + int(start[0] - x1) - 1 : int(start[0] - x1) + 2, + int(start[1] - y1) - 1 : int(start[1] - y1) + 2, + ] = 1 + + traversible = add_boundary(traversible) + goal = add_boundary(goal, value=0) + + planner = FMMPlanner(traversible, step_size=self.config.PLANNER_STEP_SIZE) + # Set the goal size + selem = skimage.morphology.disk(self.config.GOAL_DILATION_SELEM_RADIUS) + goal = skimage.morphology.binary_dilation(goal, selem) != True # noqa + goal = 1 - goal * 1.0 + 
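# The goal map now holds 1 at (dilated) goal cells and 0 elsewhere.
+ # set_multi_goal runs the fast marching method (skfmm.distance) outward from
+ # every goal cell over the traversible area; get_short_term_goal below then
+ # descends the resulting distance field to pick a waypoint within step_size.
+ 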
planner.set_multi_goal(goal) + + + if self.config.VISUALIZE: + dump_dir = "{}/dump/{}/".format(self.config.DUMP_LOCATION, self.config.EXP_NAME) + ep_dir = "{}/episodes/thread_{}/eps_{}/".format( + dump_dir, self.rank, self.episode_no + ) + if not os.path.exists(ep_dir): + os.makedirs(ep_dir) + r, c = traversible.shape + dist_vis = np.zeros((r, c * 3)) + dist_vis[:, :c] = np.flipud(traversible) + dist_vis[:, c : 2 * c] = np.flipud(goal) + dist_vis[:, 2 * c :] = np.flipud(planner.fmm_dist / planner.fmm_dist.max()) + + fn = "{}/episodes/thread_{}/eps_{}/frontier-{}-{}-Vis-{}.png".format( + dump_dir, + self.rank, + self.episode_no, + self.rank, + self.episode_no, + self.timestep, + ) + + font = cv2.FONT_HERSHEY_SIMPLEX + fontScale = 0.3 + color = (0, 0, 255) # BGR + thickness = 1 + dist_vis = cv2.cvtColor((255.0 * dist_vis).astype(np.uint8), cv2.COLOR_GRAY2BGR) + dist_vis = cv2.putText(dist_vis, "trav. (w: trav.; b: can't tarv.)", (2, 25), font, fontScale, color, thickness, cv2.LINE_AA) + dist_vis = cv2.putText(dist_vis, "goal (w: goal; b: non-goal)", (c+2,25), font, fontScale, color, thickness, cv2.LINE_AA) + dist_vis = cv2.putText(dist_vis, "trav.+goal (w: non-goal target; b: goal target)", (2*c+2,25), font, fontScale, color, thickness, cv2.LINE_AA) + cv2.imwrite(fn, dist_vis.astype(np.uint8)) + cv2.waitKey(1) + + state = [start[0] - x1 + 1, start[1] - y1 + 1] + # Add the replan flag + stg_x, stg_y, replan, stop = planner.get_short_term_goal(state) + + stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1 + + return (stg_x, stg_y), replan, stop + + def _preprocess_obs(self, obs, use_seg=True): + obs = obs.transpose(1, 2, 0) + rgb = obs[:, :, :3] + depth = obs[:, :, 3:4] + + sem_seg_pred = self._get_sem_pred(rgb.astype(np.uint8), use_seg=use_seg) + self.depth_vis = depth + depth = self._preprocess_depth( + depth, self.config.MIN_DEPTH, self.config.MAX_DEPTH + ) + + ds = ( + self.config.ENV_FRAME_WIDTH // self.config.FRAME_WIDTH + ) # Downscaling factor + if ds != 1: + rgb = np.asarray(self.res(rgb.astype(np.uint8))) + depth = depth[ds // 2 :: ds, ds // 2 :: ds] + sem_seg_pred = sem_seg_pred[ds // 2 :: ds, ds // 2 :: ds] + + depth = np.expand_dims(depth, axis=2) + state = np.concatenate((rgb, depth, sem_seg_pred), axis=2).transpose(2, 0, 1) + return state + + def _preprocess_depth(self, depth, min_d, max_d): + depth = depth[:, :, 0] * 1 + + for i in range(depth.shape[1]): + depth[:, i][depth[:, i] == 0.0] = depth[:, i].max() + + mask2 = depth > 0.99 + depth[mask2] = 0.0 + + mask1 = depth == 0 + depth[mask1] = 100.0 + depth = min_d * 100.0 + depth * max_d * 100.0 + return depth + + def _get_sem_pred(self, rgb, use_seg=True): + if use_seg: + semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb) + semantic_pred = semantic_pred.astype(np.float32) + else: + semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16)) + self.rgb_vis = rgb[:, :, ::-1] + return semantic_pred + + def _get_distance_to_obstacle(self): + """"Return the distance between the obstacle and the robot.""" + x1, y1, t1 = self.last_loc + x2, y2, _ = self.curr_loc + dist = pu.get_l2_distance(x1, x2, y1, y2) + return dist + + + def _visualize(self, inputs): + dump_dir = "{}/dump/{}/".format(self.config.DUMP_LOCATION, self.config.EXP_NAME) + ep_dir = "{}/episodes/thread_{}/eps_{}/".format( + dump_dir, self.rank, self.episode_no + ) + if not os.path.exists(ep_dir): + os.makedirs(ep_dir) + + map_pred = inputs["map_pred"] + exp_pred = inputs["exp_pred"] + start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs["pose_pred"] + + goal = 
inputs["goal"] + goal[int(self.cur_stg[0]), int(self.cur_stg[1])] = 1 + sem_map = inputs["sem_map_pred"] + + gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) + + sem_map += 5 + + no_cat_mask = sem_map == self.config.NUM_SEM_CATEGORIES + 4 # 20 + map_mask = np.rint(map_pred) == 1 + exp_mask = np.rint(exp_pred) == 1 + vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1 + + sem_map[no_cat_mask] = 0 + m1 = np.logical_and(no_cat_mask, exp_mask) + sem_map[m1] = 2 + + m2 = np.logical_and(no_cat_mask, map_mask) + sem_map[m2] = 1 + + sem_map[vis_mask] = 3 + + selem = skimage.morphology.disk(self.goal_dilation_selem_radius) + goal_mat = 1 - skimage.morphology.binary_dilation(goal, selem) != True # noqa + + goal_mask = goal_mat == 1 + sem_map[goal_mask] = 4 + + color_pal = [int(x * 255.0) for x in color_palette] + sem_map_vis = Image.new("P", (sem_map.shape[1], sem_map.shape[0])) + sem_map_vis.putpalette(color_pal) + sem_map_vis.putdata(sem_map.flatten().astype(np.uint8)) + sem_map_vis = sem_map_vis.convert("RGB") + sem_map_vis = np.flipud(sem_map_vis) + + sem_map_vis = sem_map_vis[:, :, [2, 1, 0]] + sem_map_vis = cv2.resize( + sem_map_vis, (480, 480), interpolation=cv2.INTER_NEAREST + ) + self.depth_vis = cv2.cvtColor((255.0 * self.depth_vis).astype(np.uint8), cv2.COLOR_GRAY2BGR) + self.vis_image[ + 50 : 50 + self.config.ENV_FRAME_HEIGHT, + 15 : 15 + self.config.ENV_FRAME_WIDTH, + ] = self.rgb_vis # depth_vis or rgb_vis + self.vis_image[50:530, 670:1150] = sem_map_vis + + pos = ( + (start_x * 100.0 / self.config.MAP_RESOLUTION - gy1) + * 480 + / map_pred.shape[0], + (map_pred.shape[1] - start_y * 100.0 / self.config.MAP_RESOLUTION + gx1) + * 480 + / map_pred.shape[1], + np.deg2rad(-start_o), + ) + + agent_arrow = vu.get_contour_points(pos, origin=(670, 50)) + color = ( + int(color_palette[11] * 255), + int(color_palette[10] * 255), + int(color_palette[9] * 255), + ) + cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1) + + if self.config.VISUALIZE: + fn = "{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png".format( + dump_dir, + self.rank, + self.episode_no, + self.rank, + self.episode_no, + self.timestep, + ) + cv2.imwrite(fn, self.vis_image) diff --git a/semantic_exploration/agents/utils/detic_semantic_prediction.py b/semantic_exploration/agents/utils/detic_semantic_prediction.py new file mode 100644 index 0000000..e14d044 --- /dev/null +++ b/semantic_exploration/agents/utils/detic_semantic_prediction.py @@ -0,0 +1,338 @@ +# The following code is largely borrowed from +# https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and +# https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py + +import argparse +import pathlib +import sys +import time +from pathlib import Path + +import detectron2.data.transforms as T +import numpy as np +import torch + +ROOT_DETIC = str(Path(__file__).resolve().parent).split("third_party")[0]+"third_party/" +sys.path.insert(0, ROOT_DETIC + "Detic/third_party/CenterNet2") +sys.path.insert(0, ROOT_DETIC + "Detic") +from centernet.config import add_centernet_config # noqa: E402 +from third_party.semantic_exploration.constants import coco_categories_mapping # noqa: E402 +from detectron2.checkpoint import DetectionCheckpointer # noqa: E402 +from detectron2.config import get_cfg # noqa: E402 +from detectron2.data.catalog import MetadataCatalog # noqa: E402 +from detectron2.engine.defaults import DefaultPredictor # noqa: E402 +from detectron2.modeling import build_model # noqa: E402 +from detectron2.utils.logger import 
setup_logger # noqa: E402 +from detectron2.utils.visualizer import ColorMode, Visualizer # noqa: E402 +from detic.config import add_detic_config # noqa: E402 +from detic.modeling.text.text_encoder import build_text_encoder # noqa: E402 +from detic.modeling.utils import reset_cls_test # noqa: E402 + +BUILDIN_CLASSIFIER = { + "lvis": ROOT_DETIC + "Detic/datasets/metadata/lvis_v1_clip_a+cname.npy", + "objects365": ROOT_DETIC + "Detic/datasets/metadata/o365_clip_a+cnamefix.npy", + "openimages": ROOT_DETIC + "Detic/datasets/metadata/oid_clip_a+cname.npy", + "coco": ROOT_DETIC + "Detic/datasets/metadata/coco_clip_a+cname.npy", +} + +BUILDIN_METADATA_PATH = { + "lvis": "lvis_v1_val", + "objects365": "objects365_v2_val", + "openimages": "oid_val_expanded", + "coco": "coco_2017_val", +} + + +class SemanticPredDetic: + def __init__(self, args): + self.segmentation_model = ImageSegmentation(args) + self.args = args + + def get_prediction(self, img): + args = self.args + image_list = [] + img = img[:, :, ::-1] + image_list.append(img) + seg_predictions, vis_output = self.segmentation_model.get_predictions( + image_list, visualize=args.visualize == 2 + ) + + if args.visualize == 2: + img = vis_output.get_image() + + semantic_input = np.zeros( + (img.shape[0], img.shape[1], 16 + 1) + ) # self.args.num_sem_categories )) #15 + 1)) + + for j, class_idx in enumerate( + seg_predictions[0]["instances"].pred_classes.cpu().numpy() + ): + if class_idx in list(coco_categories_mapping.keys()): + idx = coco_categories_mapping[class_idx] + obj_mask = seg_predictions[0]["instances"].pred_masks[j] * 1.0 + semantic_input[:, :, idx] += obj_mask.cpu().numpy() + # The shape of the semantic input is (480, 640, 17) + return semantic_input, img + + +def compress_sem_map(sem_map): + c_map = np.zeros((sem_map.shape[1], sem_map.shape[2])) + for i in range(sem_map.shape[0]): + c_map[sem_map[i] > 0.0] = i + 1 + return c_map + + +class ImageSegmentation: + def __init__(self, args): + string_args = """ + --config-file {} + --input input1.jpeg + --vocabulary coco + --confidence-threshold {} + --opts MODEL.WEIGHTS {} + """.format( + ROOT_DETIC + "/Detic/configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml", + args.sem_pred_prob_thr, + ROOT_DETIC + "/Detic/configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" + ) + + if args.sem_gpu_id == -2: + string_args += """ MODEL.DEVICE cpu""" + else: + string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id) + + string_args = string_args.split() + + args = get_seg_parser().parse_args(string_args) + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + cfg = setup_cfg(args) + + assert args.vocabulary in ["coco", "custom"] + if args.vocabulary == "custom": + raise NotImplementedError + elif args.vocabulary == "coco": + self.metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[args.vocabulary]) + classifier = BUILDIN_CLASSIFIER[args.vocabulary] + self.categories_mapping = { + 56: 0, # chair + 57: 1, # couch + 58: 2, # plant + 59: 3, # bed + 61: 4, # toilet + 62: 5, # tv + 60: 6, # table + 69: 7, # oven + 71: 8, # sink + 72: 9, # refrigerator + 73: 10, # book + 74: 11, # clock + 75: 12, # vase + 41: 13, # cup + 39: 14, # bottle + } + + self.num_sem_categories = len(self.categories_mapping) + num_classes = len(self.metadata.thing_classes) + self.instance_mode = ColorMode.IMAGE + self.demo = VisualizationDemo(cfg, classifier, num_classes) + + def get_predictions(self, img, visualize=0): + return self.demo.run_on_image(img, visualize=visualize) + + +def 
setup_cfg(args): + cfg = get_cfg() + # We forcefully use cpu here + cfg.MODEL.DEVICE = "cpu" + add_centernet_config(cfg) + add_detic_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + # Set score_threshold for builtin models + cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = ( + args.confidence_threshold + ) + cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "rand" # load later + cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True + cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = ( + ROOT_DETIC + "Detic/" + cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH + ) + # Fix cfg paths given we're not running from the Detic folder + cfg.MODEL.TEST_CLASSIFIERS[0] = ( + ROOT_DETIC + "Detic/" + cfg.MODEL.TEST_CLASSIFIERS[0] + ) + cfg.MODEL.TEST_CLASSIFIERS[1] = ( + ROOT_DETIC + "Detic/" + cfg.MODEL.TEST_CLASSIFIERS[1] + ) + cfg.freeze() + return cfg + + +class VisualizationDemo(object): + def __init__(self, cfg, classifier, num_classes, instance_mode=ColorMode.IMAGE): + """ + Args: + cfg (CfgNode): + instance_mode (ColorMode): + """ + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" + ) + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + self.predictor = BatchPredictor(cfg) + + if type(classifier) == pathlib.PosixPath: + classifier = str(classifier) + reset_cls_test(self.predictor.model, classifier, num_classes) + + def run_on_image(self, image_list, visualize=0): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + vis_output = None + all_predictions = self.predictor(image_list) + + # Convert image from OpenCV BGR format to Matplotlib RGB format. + if visualize: + predictions = all_predictions[0] + image = image_list[0] + visualizer = Visualizer( + image, self.metadata, instance_mode=self.instance_mode + ) + if "panoptic_seg" in predictions: + panoptic_seg, segments_info = predictions["panoptic_seg"] + vis_output = visualizer.draw_panoptic_seg_predictions( + panoptic_seg.to(self.cpu_device), segments_info + ) + else: + if "sem_seg" in predictions: + vis_output = visualizer.draw_sem_seg( + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) + ) + if "instances" in predictions: + instances = predictions["instances"].to(self.cpu_device) + vis_output = visualizer.draw_instance_predictions( + predictions=instances + ) + + return all_predictions, vis_output + + +def get_seg_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models") + parser.add_argument( + "--config-file", + default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--webcam", action="store_true", help="Take inputs from webcam." + ) + parser.add_argument("--video-input", help="Path to video file.") + parser.add_argument( + "--input", nargs="+", help="A list of space separated input images" + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. 
" + "If not given, will show output in an OpenCV window.", + ) + parser.add_argument( + "--vocabulary", + default="lvis", + choices=["lvis", "openimages", "objects365", "coco", "custom"], + help="", + ) + parser.add_argument( + "--custom_vocabulary", + default="", + help="", + ) + parser.add_argument( + "--confidence-threshold", + type=float, + default=0.1, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +class BatchPredictor: + """ + Create a simple end-to-end predictor with the given config that runs on + single device for a list of input images. + + Compared to using the model directly, this class does the following + additions: + + 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. + 2. Always take BGR image as the input and apply conversion defined by + `cfg.INPUT.FORMAT`. + 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. + 4. Take a list of input images + + Attributes: + metadata (Metadata): the metadata of the underlying dataset, obtained + from cfg.DATASETS.TEST. + + """ + + def __init__(self, cfg): + self.cfg = cfg.clone() # cfg can be modified by model + self.model = build_model(self.cfg) + self.model.eval() + self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + + checkpointer = DetectionCheckpointer(self.model) + checkpointer.load(cfg.MODEL.WEIGHTS) + + self.input_format = cfg.INPUT.FORMAT + assert self.input_format in ["RGB", "BGR"], self.input_format + + def __call__(self, image_list): + """ + Args: + image_list (list of np.ndarray): a list of images of + shape (H, W, C) (in BGR order). + + Returns: + predictions (dict): + the output of the model for all images. + See :doc:`/tutorials/models` for details about the format. + """ + inputs = [] + for original_image in image_list: + # https://github.com/sphinx-doc/sphinx/issues/4258 + # Apply pre-processing to image. 
+ if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + original_image = original_image[:, :, ::-1] + height, width = original_image.shape[:2] + image = original_image + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + + instance = {"image": image, "height": height, "width": width} + + inputs.append(instance) + + with torch.no_grad(): + predictions = self.model(inputs) + return predictions diff --git a/semantic_exploration/agents/utils/fmm_planner.py b/semantic_exploration/agents/utils/fmm_planner.py new file mode 100644 index 0000000..5256498 --- /dev/null +++ b/semantic_exploration/agents/utils/fmm_planner.py @@ -0,0 +1,153 @@ +import cv2 +import numpy as np +import skfmm +import skimage +from numpy import ma + + +def get_mask(sx, sy, scale, step_size): + size = int(step_size // scale) * 2 + 1 + mask = np.zeros((size, size)) + for i in range(size): + for j in range(size): + if ((i + 0.5) - (size // 2 + sx)) ** 2 + ( + (j + 0.5) - (size // 2 + sy) + ) ** 2 <= step_size**2 and ((i + 0.5) - (size // 2 + sx)) ** 2 + ( + (j + 0.5) - (size // 2 + sy) + ) ** 2 > ( + step_size - 1 + ) ** 2: + mask[i, j] = 1 + + mask[size // 2, size // 2] = 1 + return mask + + +def get_dist(sx, sy, scale, step_size): + size = int(step_size // scale) * 2 + 1 + mask = np.zeros((size, size)) + 1e-10 + for i in range(size): + for j in range(size): + if ((i + 0.5) - (size // 2 + sx)) ** 2 + ( + (j + 0.5) - (size // 2 + sy) + ) ** 2 <= step_size**2: + mask[i, j] = max( + 5, + ( + ((i + 0.5) - (size // 2 + sx)) ** 2 + + ((j + 0.5) - (size // 2 + sy)) ** 2 + ) + ** 0.5, + ) + return mask + + +class FMMPlanner: + def __init__(self, traversible, scale=1, step_size=5): + self.scale = scale + self.step_size = step_size + if scale != 1.0: + self.traversible = cv2.resize( + traversible, + (traversible.shape[1] // scale, traversible.shape[0] // scale), + interpolation=cv2.INTER_NEAREST, + ) + self.traversible = np.rint(self.traversible) + else: + self.traversible = traversible + + self.du = int(self.step_size / (self.scale * 1.0)) + self.fmm_dist = None + + def set_goal(self, goal, auto_improve=False): + traversible_ma = ma.masked_values(self.traversible * 1, 0) + goal_x, goal_y = int(goal[0] / (self.scale * 1.0)), int( + goal[1] / (self.scale * 1.0) + ) + + if self.traversible[goal_x, goal_y] == 0.0 and auto_improve: + goal_x, goal_y = self._find_nearest_goal([goal_x, goal_y]) + + traversible_ma[goal_x, goal_y] = 0 + dd = skfmm.distance(traversible_ma, dx=1) + dd = ma.filled(dd, np.max(dd) + 1) + self.fmm_dist = dd + return + + def set_multi_goal(self, goal_map): + traversible_ma = ma.masked_values(self.traversible * 1, 0) + traversible_ma[goal_map == 1] = 0 + dd = skfmm.distance(traversible_ma, dx=1) + dd = ma.filled(dd, np.max(dd) + 1) + self.fmm_dist = dd + return + + def get_short_term_goal(self, state): + scale = self.scale * 1.0 + state = [x / scale for x in state] + dx, dy = state[0] - int(state[0]), state[1] - int(state[1]) + mask = get_mask(dx, dy, scale, self.step_size) + dist_mask = get_dist(dx, dy, scale, self.step_size) + + state = [int(x) for x in state] + + dist = np.pad( + self.fmm_dist, + self.du, + "constant", + constant_values=self.fmm_dist.shape[0] ** 2, + ) + subset = dist[ + state[0] : state[0] + 2 * self.du + 1, state[1] : state[1] + 2 * self.du + 1 + ] + + assert ( + subset.shape[0] == 2 * self.du + 1 and subset.shape[1] == 2 * self.du + 1 + ), "Planning error: unexpected subset shape {}".format(subset.shape) + + subset *= mask + subset += (1 - mask) * 
self.fmm_dist.shape[0] ** 2 + + if subset[self.du, self.du] < self.step_size: # < 0.25 * 100 / 5.: # 25cm + stop = True + else: + stop = False + + subset -= subset[self.du, self.du] + ratio1 = subset / dist_mask + subset[ratio1 < -1.5] = 1 + + # Find the smallest number index + (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape) + + if subset[stg_x, stg_y] > -0.0001: + replan = True + else: + replan = False + + return ( + (stg_x + state[0] - self.du) * scale, + (stg_y + state[1] - self.du) * scale, + replan, + stop, + ) + + def _find_nearest_goal(self, goal): + traversible = ( + skimage.morphology.binary_dilation( + np.zeros(self.traversible.shape), skimage.morphology.disk(2) + ) + != True # noqa + ) + traversible = traversible * 1.0 + planner = FMMPlanner(traversible) + planner.set_goal(goal) + + mask = self.traversible + + dist_map = planner.fmm_dist * mask + dist_map[dist_map == 0] = dist_map.max() + + goal = np.unravel_index(dist_map.argmin(), dist_map.shape) + + return goal diff --git a/semantic_exploration/agents/utils/owlvit_semantic_prediction.py b/semantic_exploration/agents/utils/owlvit_semantic_prediction.py new file mode 100644 index 0000000..f40614b --- /dev/null +++ b/semantic_exploration/agents/utils/owlvit_semantic_prediction.py @@ -0,0 +1,107 @@ +# The following code is largely borrowed from +# https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and +# https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py + +import sys +from pathlib import Path +ROOT_DETIC = str(Path(__file__).resolve().parent).split("third_party")[0]+"third_party/" +sys.path.insert(0, ROOT_DETIC + "Detic/third_party/CenterNet2") +sys.path.insert(0, ROOT_DETIC + "Detic") + +import argparse # noqa: E402 +import pathlib # noqa: E402 +import time # noqa: E402 +from pathlib import Path # noqa: E402 + +import cv2 # noqa: E402 +import numpy as np # noqa: E402 +import torch # noqa: E402 +from third_party.semantic_exploration.constants import coco_categories, coco_categories_mapping # noqa: E402 +from PIL import Image # noqa: E402 +from transformers import OwlViTForObjectDetection, OwlViTProcessor # noqa: E402 + + +class SemanticPredOwlvit: + def __init__(self, config): + self.config = config + # Get the device + self.device = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) + # Get the owlvit model + self.model = OwlViTForObjectDetection.from_pretrained( + "google/owlvit-base-patch32" + ) + self.model.eval() + self.model.to(self.device) + # Define the prefix + self.prefix="an image of " + # Get the pretrained model + self.processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") + # Get the meta info + labels = [] + for _key in coco_categories: + labels.append(self.prefix+_key) + self.labels = [labels] + self.score_threshold = 0.15 + + def get_prediction(self, img): + img = img[:, :, ::-1] + # Process inputs + inputs = self.processor(text=self.labels, images=img, return_tensors="pt") + target_sizes = torch.Tensor([img.shape[:2]]) + + # Inference + with torch.no_grad(): + outputs = self.model(**inputs) + + # Convert outputs (bounding boxes and class logits) to COCO API + results = self.processor.post_process( + outputs=outputs, target_sizes=target_sizes + ) + + # Process the image + img_i = 0 + boxes, scores, labels = ( + results[img_i]["boxes"], + results[img_i]["scores"], + results[img_i]["labels"], + ) + semantic_input = np.zeros((img.shape[0], img.shape[1], 16 + 1)) + for box, score, label in zip(boxes, 
scores, labels): + # Get the location of the bounding box + if score >= self.score_threshold: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = [ + int(round(i, 0)) for i in box.tolist() + ] + semantic_input[ + top_left_x:bottom_right_x, top_left_y:bottom_right_y, int(label) + ] = 1 + if self.config.VISUALIZE is True and score >= self.score_threshold: + # Use this line code to add bounding box to the image + img = np.ascontiguousarray(img, dtype=np.uint8) + cv2.rectangle( + img, + (top_left_x, top_left_y), + (bottom_right_x, bottom_right_y), + (0, 0, 255), + 2, + ) + cv2.putText( + img, + self.labels[0][int(label)], + (top_left_x, top_left_y - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.9, + (0, 0, 255), + 2, + ) + + return semantic_input, img + + +def compress_sem_map(sem_map): + c_map = np.zeros((sem_map.shape[1], sem_map.shape[2])) + for i in range(sem_map.shape[0]): + c_map[sem_map[i] > 0.0] = i + 1 + return c_map diff --git a/agents/utils/semantic_prediction.py b/semantic_exploration/agents/utils/semantic_prediction.py similarity index 84% rename from agents/utils/semantic_prediction.py rename to semantic_exploration/agents/utils/semantic_prediction.py index 3ce9675..3b70ee0 100644 --- a/agents/utils/semantic_prediction.py +++ b/semantic_exploration/agents/utils/semantic_prediction.py @@ -4,23 +4,20 @@ import argparse import time - -import torch +from pathlib import Path +import detectron2.data.transforms as T import numpy as np - +import torch +from third_party.semantic_exploration.constants import coco_categories_mapping +from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.utils.logger import setup_logger from detectron2.data.catalog import MetadataCatalog from detectron2.modeling import build_model -from detectron2.checkpoint import DetectionCheckpointer +from detectron2.utils.logger import setup_logger from detectron2.utils.visualizer import ColorMode, Visualizer -import detectron2.data.transforms as T -from constants import coco_categories_mapping - - -class SemanticPredMaskRCNN(): +class SemanticPredMaskRCNN: def __init__(self, args): self.segmentation_model = ImageSegmentation(args) self.args = args @@ -31,39 +28,48 @@ def get_prediction(self, img): img = img[:, :, ::-1] image_list.append(img) seg_predictions, vis_output = self.segmentation_model.get_predictions( - image_list, visualize=args.visualize == 2) + image_list, visualize=args.visualize == 2 + ) if args.visualize == 2: img = vis_output.get_image() - semantic_input = np.zeros((img.shape[0], img.shape[1], 15 + 1)) + semantic_input = np.zeros( + (img.shape[0], img.shape[1], 16 + 1) + ) # self.args.num_sem_categories )) #15 + 1)) for j, class_idx in enumerate( - seg_predictions[0]['instances'].pred_classes.cpu().numpy()): + seg_predictions[0]["instances"].pred_classes.cpu().numpy() + ): if class_idx in list(coco_categories_mapping.keys()): idx = coco_categories_mapping[class_idx] - obj_mask = seg_predictions[0]['instances'].pred_masks[j] * 1. + obj_mask = seg_predictions[0]["instances"].pred_masks[j] * 1.0 semantic_input[:, :, idx] += obj_mask.cpu().numpy() - + # The shape of the semantic input is (480, 640, 17) return semantic_input, img def compress_sem_map(sem_map): c_map = np.zeros((sem_map.shape[1], sem_map.shape[2])) for i in range(sem_map.shape[0]): - c_map[sem_map[i] > 0.] 
= i + 1 + c_map[sem_map[i] > 0.0] = i + 1 return c_map -class ImageSegmentation(): +class ImageSegmentation: def __init__(self, args): + ROOT = str(Path(__file__).resolve().parent).split("third_party")[0]+"third_party/" + model_path = ROOT + "detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml" string_args = """ - --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml + --config-file {} --input input1.jpeg --confidence-threshold {} --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl - """.format(args.sem_pred_prob_thr) + """.format( + model_path, + args.sem_pred_prob_thr + ) if args.sem_gpu_id == -2: string_args += """ MODEL.DEVICE cpu""" @@ -71,7 +77,6 @@ def __init__(self, args): string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id) string_args = string_args.split() - args = get_seg_parser().parse_args(string_args) logger = setup_logger() logger.info("Arguments: " + str(args)) @@ -91,15 +96,15 @@ def setup_cfg(args): # Set score_threshold for builtin models cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold - cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = \ + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = ( args.confidence_threshold + ) cfg.freeze() return cfg def get_seg_parser(): - parser = argparse.ArgumentParser( - description="Detectron2 demo for builtin models") + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models") parser.add_argument( "--config-file", default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", @@ -107,14 +112,12 @@ def get_seg_parser(): help="path to config file", ) parser.add_argument( - "--webcam", - action="store_true", - help="Take inputs from webcam.") + "--webcam", action="store_true", help="Take inputs from webcam." + ) parser.add_argument("--video-input", help="Path to video file.") parser.add_argument( - "--input", - nargs="+", - help="A list of space separated input images") + "--input", nargs="+", help="A list of space separated input images" + ) parser.add_argument( "--output", help="A file or directory to save output visualizations. 
" @@ -124,7 +127,7 @@ def get_seg_parser(): parser.add_argument( "--confidence-threshold", type=float, - default=0.5, + default=0.1, help="Minimum score for instance predictions to be shown", ) parser.add_argument( @@ -169,7 +172,8 @@ def run_on_image(self, image_list, visualize=0): predictions = all_predictions[0] image = image_list[0] visualizer = Visualizer( - image, self.metadata, instance_mode=self.instance_mode) + image, self.metadata, instance_mode=self.instance_mode + ) if "panoptic_seg" in predictions: panoptic_seg, segments_info = predictions["panoptic_seg"] vis_output = visualizer.draw_panoptic_seg_predictions( @@ -178,13 +182,13 @@ def run_on_image(self, image_list, visualize=0): else: if "sem_seg" in predictions: vis_output = visualizer.draw_sem_seg( - predictions["sem_seg"].argmax( - dim=0).to(self.cpu_device) + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) ) if "instances" in predictions: instances = predictions["instances"].to(self.cpu_device) vis_output = visualizer.draw_instance_predictions( - predictions=instances) + predictions=instances + ) return all_predictions, vis_output diff --git a/agents/utils/visualization.py b/semantic_exploration/agents/utils/visualization.py similarity index 61% rename from agents/utils/visualization.py rename to semantic_exploration/agents/utils/visualization.py index 16b3d40..a269bc5 100644 --- a/agents/utils/visualization.py +++ b/semantic_exploration/agents/utils/visualization.py @@ -4,14 +4,16 @@ def get_contour_points(pos, origin, size=20): x, y, o = pos - pt1 = (int(x) + origin[0], - int(y) + origin[1]) - pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0], - int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1]) - pt3 = (int(x + size * np.cos(o)) + origin[0], - int(y + size * np.sin(o)) + origin[1]) - pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0], - int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1]) + pt1 = (int(x) + origin[0], int(y) + origin[1]) + pt2 = ( + int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0], + int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1], + ) + pt3 = (int(x + size * np.cos(o)) + origin[0], int(y + size * np.sin(o)) + origin[1]) + pt4 = ( + int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0], + int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1], + ) return np.array([pt1, pt2, pt3, pt4]) @@ -20,7 +22,7 @@ def draw_line(start, end, mat, steps=25, w=1): for i in range(steps + 1): x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps)) y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps)) - mat[x - w:x + w, y - w:y + w] = 1 + mat[x - w : x + w, y - w : y + w] = 1 return mat @@ -35,17 +37,17 @@ def init_vis_image(goal_name, legend): textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] textX = (640 - textsize[0]) // 2 + 15 textY = (50 + textsize[1]) // 2 - vis_image = cv2.putText(vis_image, text, (textX, textY), - font, fontScale, color, thickness, - cv2.LINE_AA) + vis_image = cv2.putText( + vis_image, text, (textX, textY), font, fontScale, color, thickness, cv2.LINE_AA + ) text = "Predicted Semantic Map" textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] textX = 640 + (480 - textsize[0]) // 2 + 30 textY = (50 + textsize[1]) // 2 - vis_image = cv2.putText(vis_image, text, (textX, textY), - font, fontScale, color, thickness, - cv2.LINE_AA) + vis_image = cv2.putText( + vis_image, text, (textX, textY), font, fontScale, color, thickness, cv2.LINE_AA + ) # draw outlines color = [100, 100, 
100] @@ -60,6 +62,6 @@ def init_vis_image(goal_name, legend): # draw legend lx, ly, _ = legend.shape - vis_image[537:537 + lx, 155:155 + ly, :] = legend + vis_image[537 : 537 + lx, 155 : 155 + ly, :] = legend return vis_image diff --git a/semantic_exploration/constants.py b/semantic_exploration/constants.py new file mode 100644 index 0000000..ac92b95 --- /dev/null +++ b/semantic_exploration/constants.py @@ -0,0 +1,155 @@ +scenes = {} +scenes["train"] = [ + "Allensville", + "Beechwood", + "Benevolence", + "Coffeen", + "Cosmos", + "Forkland", + "Hanson", + "Hiteman", + "Klickitat", + "Lakeville", + "Leonardo", + "Lindenwood", + "Marstons", + "Merom", + "Mifflinburg", + "Newfields", + "Onaga", + "Pinesdale", + "Pomaria", + "Ranchester", + "Shelbyville", + "Stockman", + "Tolstoy", + "Wainscott", + "Woodbine", +] + +scenes["val"] = [ + "Collierville", + "Corozal", + "Darden", + "Markleeville", + "Wiconisco", +] + +coco_categories = { + "chair": 0, + "couch": 1, + "potted plant": 2, + "bed": 3, + "toilet": 4, + "tv": 5, + "dining-table": 6, + "oven": 7, + "sink": 8, + "refrigerator": 9, + "book": 10, + "clock": 11, + "vase": 12, + "cup": 13, + "bottle": 14, +} + +coco_categories_replica = { + "chair": 0, + "sofa": 1, + "plant": 2, + "bed": 3, + "toilet": 4, + "tv": 5, + "table": 6, + "oven": 7, + "sink": 8, + "fridge": 9, + "book": 10, + "clock": 11, + "vase": 12, + "cup": 13, + "bottle": 14, + "person": 15, +} + +coco_categories_mapping = { + 56: 0, # chair + 57: 1, # couch + 58: 2, # potted plant + 59: 3, # bed + 61: 4, # toilet + 62: 5, # tv + 60: 6, # dining-table + 69: 7, # oven + 71: 8, # sink + 72: 9, # refrigerator + 73: 10, # book + 74: 11, # clock + 75: 12, # vase + 41: 13, # cup + 39: 14, # bottle + 0: 15, # person +} + +color_palette = [ + 1.0, + 1.0, + 1.0, + 0.6, + 0.6, + 0.6, + 0.95, + 0.95, + 0.95, + 0.96, + 0.36, + 0.26, + 0.12156862745098039, + 0.47058823529411764, + 0.7058823529411765, + 0.9400000000000001, + 0.7818, + 0.66, + 0.9400000000000001, + 0.8868, + 0.66, + 0.8882000000000001, + 0.9400000000000001, + 0.66, + 0.7832000000000001, + 0.9400000000000001, + 0.66, + 0.6782000000000001, + 0.9400000000000001, + 0.66, + 0.66, + 0.9400000000000001, + 0.7468000000000001, + 0.66, + 0.9400000000000001, + 0.8518000000000001, + 0.66, + 0.9232, + 0.9400000000000001, + 0.66, + 0.8182, + 0.9400000000000001, + 0.66, + 0.7132, + 0.9400000000000001, + 0.7117999999999999, + 0.66, + 0.9400000000000001, + 0.8168, + 0.66, + 0.9400000000000001, + 0.9218, + 0.66, + 0.9400000000000001, + 0.9400000000000001, + 0.66, + 0.8531999999999998, + 0.9400000000000001, + 0.66, + 0.748199999999999, +] diff --git a/docs/DOCKER_INSTRUCTIONS.md b/semantic_exploration/docs/DOCKER_INSTRUCTIONS.md similarity index 100% rename from docs/DOCKER_INSTRUCTIONS.md rename to semantic_exploration/docs/DOCKER_INSTRUCTIONS.md diff --git a/docs/INSTRUCTIONS.md b/semantic_exploration/docs/INSTRUCTIONS.md similarity index 100% rename from docs/INSTRUCTIONS.md rename to semantic_exploration/docs/INSTRUCTIONS.md diff --git a/docs/example.gif b/semantic_exploration/docs/example.gif similarity index 100% rename from docs/example.gif rename to semantic_exploration/docs/example.gif diff --git a/docs/legend.png b/semantic_exploration/docs/legend.png similarity index 100% rename from docs/legend.png rename to semantic_exploration/docs/legend.png diff --git a/docs/overview.jpg b/semantic_exploration/docs/overview.jpg similarity index 100% rename from docs/overview.jpg rename to semantic_exploration/docs/overview.jpg diff --git 
a/envs/__init__.py b/semantic_exploration/envs/__init__.py similarity index 80% rename from envs/__init__.py rename to semantic_exploration/envs/__init__.py index 2098b62..9ea7d02 100755 --- a/envs/__init__.py +++ b/semantic_exploration/envs/__init__.py @@ -1,11 +1,9 @@ import torch -from .habitat import construct_envs - -def make_vec_envs(args): - envs = construct_envs(args) - envs = VecPyTorch(envs, args.device) +def make_vec_envs(args, is_slurm=False, is_eval=False): + envs, num_envs = construct_envs(args, is_slurm, is_eval) + envs = VecPyTorch(envs, num_envs, args.device) return envs @@ -13,11 +11,11 @@ def make_vec_envs(args): # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/envs.py#L159 class VecPyTorch(): - def __init__(self, venv, device): + def __init__(self, venv, num_envs, device): self.venv = venv - self.num_envs = venv.num_envs - self.observation_space = venv.observation_space - self.action_space = venv.action_space + self.num_envs = num_envs + # self.observation_space = venv.observation_space + # self.action_space = venv.action_space self.device = device def reset(self): diff --git a/envs/utils/depth_utils.py b/semantic_exploration/envs/utils/depth_utils.py similarity index 99% rename from envs/utils/depth_utils.py rename to semantic_exploration/envs/utils/depth_utils.py index afe98e2..a6c430a 100644 --- a/envs/utils/depth_utils.py +++ b/semantic_exploration/envs/utils/depth_utils.py @@ -21,7 +21,7 @@ import numpy as np import torch -import envs.utils.rotation_utils as ru +import third_party.semantic_exploration.envs.utils.rotation_utils as ru def get_camera_matrix(width, height, fov): diff --git a/envs/utils/fmm_planner.py b/semantic_exploration/envs/utils/fmm_planner.py similarity index 90% rename from envs/utils/fmm_planner.py rename to semantic_exploration/envs/utils/fmm_planner.py index c2fd0bd..82bce12 100644 --- a/envs/utils/fmm_planner.py +++ b/semantic_exploration/envs/utils/fmm_planner.py @@ -37,7 +37,15 @@ def get_dist(sx, sy, scale, step_size): class FMMPlanner(): - def __init__(self, traversible, scale=1, step_size=5): + def __init__(self, traversible, scale=1, step_size=25): + """ + Arguments: + traversible: (M + 1, M + 1) binary map encoding traversible regions + scale: map scale + step_size: maximum distance of the short-term goal selected by the + planner + vis_dir: folder where to dump visualization + """ self.scale = scale self.step_size = step_size if scale != 1.: @@ -95,7 +103,7 @@ def get_short_term_goal(self, state): subset *= mask subset += (1 - mask) * self.fmm_dist.shape[0] ** 2 - if subset[self.du, self.du] < 0.25 * 100 / 5.: # 25cm + if subset[self.du, self.du] < self.step_size: #< 0.25 * 100 / 5.: # 25cm stop = True else: stop = False @@ -104,6 +112,7 @@ def get_short_term_goal(self, state): ratio1 = subset / dist_mask subset[ratio1 < -1.5] = 1 + # Find the smallest number index (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape) if subset[stg_x, stg_y] > -0.0001: diff --git a/envs/utils/map_builder.py b/semantic_exploration/envs/utils/map_builder.py similarity index 100% rename from envs/utils/map_builder.py rename to semantic_exploration/envs/utils/map_builder.py diff --git a/envs/utils/pose.py b/semantic_exploration/envs/utils/pose.py similarity index 100% rename from envs/utils/pose.py rename to semantic_exploration/envs/utils/pose.py diff --git a/envs/utils/rotation_utils.py b/semantic_exploration/envs/utils/rotation_utils.py similarity index 100% rename from envs/utils/rotation_utils.py 
rename to semantic_exploration/envs/utils/rotation_utils.py diff --git a/semantic_exploration/models/__init__.py b/semantic_exploration/models/__init__.py new file mode 100644 index 0000000..e50ca4e --- /dev/null +++ b/semantic_exploration/models/__init__.py @@ -0,0 +1,3 @@ +from third_party.semantic_exploration.models.owlvit import OwlVit +from third_party.semantic_exploration.models.semantic_map import Semantic_Mapping +from third_party.semantic_exploration.models.sentence_similarity import SentenceSimilarity diff --git a/semantic_exploration/models/owlvit.py b/semantic_exploration/models/owlvit.py new file mode 100644 index 0000000..7b4e939 --- /dev/null +++ b/semantic_exploration/models/owlvit.py @@ -0,0 +1,241 @@ +# mypy: ignore-errors +import argparse +import time + +import cv2 +import torch +from PIL import Image +from transformers import OwlViTForObjectDetection, OwlViTProcessor + + +class OwlVit: + def __init__(self, labels, score_threshold, show_img): + # self.device = torch.device('cpu') + self.device = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) + + self.model = OwlViTForObjectDetection.from_pretrained( + "google/owlvit-base-patch32" + ) + self.model.eval() + self.model.to(self.device) + + self.processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") + + self.labels = labels + self.score_threshold = score_threshold + self.show_img = show_img + + def run_inference(self, img): + """ + img: an open cv image in (H, W, C) format + """ + # Process inputs + # img = img.to(self.device) + inputs = self.processor(text=self.labels, images=img, return_tensors="pt") + + # Target image sizes (height, width) to rescale box predictions [batch_size, 2] + # target_sizes = torch.Tensor([img.size[::-1]]) this is for PIL images + target_sizes = torch.Tensor([img.shape[:2]]).to(self.device) + inputs = inputs.to(self.device) + + # Inference + with torch.no_grad(): + outputs = self.model(**inputs) + + # Convert outputs (bounding boxes and class logits) to COCO API + results = self.processor.post_process( + outputs=outputs, target_sizes=target_sizes + ) + # img = img.to('cpu') + + if self.show_img: + self.show_img_with_overlaid_bounding_boxes(img, results) + + return self.get_most_confident_bounding_box_per_label(results) + + def run_inference_and_return_img(self, img): + """ + img: an open cv image in (H, W, C) format + """ + # img = img.to(self.device) + + inputs = self.processor(text=self.labels, images=img, return_tensors="pt") + target_sizes = torch.Tensor([img.shape[:2]]).to(self.device) + inputs = inputs.to(self.device) + # Inference + with torch.no_grad(): + outputs = self.model(**inputs) + + # Convert outputs (bounding boxes and class logits) to COCO API + results = self.processor.post_process( + outputs=outputs, target_sizes=target_sizes + ) + # img = img.to('cpu') + # if self.show_img: + # self.show_img_with_overlaid_bounding_boxes(img, results) + + return self.get_most_confident_bounding_box_per_label( + results + ), self.create_img_with_bounding_box(img, results) + + def show_img_with_overlaid_bounding_boxes(self, img, results): + img = self.create_img_with_bounding_box(img, results) + cv2.imshow("img", img) + cv2.waitKey(1) + + def get_bounding_boxes(self, results): + """ + Returns all bounding boxes with a score above the threshold + """ + boxes, scores, labels = ( + results[0]["boxes"], + results[0]["scores"], + results[0]["labels"], + ) + boxes = boxes.to("cpu") + labels = labels.to("cpu") + scores = scores.to("cpu") + + 
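# Keep every detection whose score is at or above self.score_threshold;
+ # each entry is [label string, score, [x1, y1, x2, y2]] with box corners
+ # in pixel coordinates (post_process already rescaled them to image size).
+ 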
target_boxes = [] + for box, score, label in zip(boxes, scores, labels): + box = [round(i, 2) for i in box.tolist()] + if score >= self.score_threshold: + target_boxes.append([self.labels[0][label.item()], score.item(), box]) + + return target_boxes + + def get_most_confident_bounding_box(self, results): + """ + Returns the most confident bounding box + """ + boxes, scores, labels = ( + results[0]["boxes"], + results[0]["scores"], + results[0]["labels"], + ) + boxes = boxes.to("cpu") + labels = labels.to("cpu") + scores = scores.to("cpu") + + target_box = [] + target_score = -float("inf") + + for box, score, label in zip(boxes, scores, labels): + box = [round(i, 2) for i in box.tolist()] + if score >= self.score_threshold: + if score > target_score: + target_score = score + target_box = box + + if target_score == -float("inf"): + return None + else: + x1 = int(target_box[0]) + y1 = int(target_box[1]) + x2 = int(target_box[2]) + y2 = int(target_box[3]) + + print("location:", x1, y1, x2, y2) + return x1, y1, x2, y2 + + def get_most_confident_bounding_box_per_label(self, results): + """ + Returns the most confident bounding box for each label above the threshold + """ + boxes, scores, labels = ( + results[0]["boxes"], + results[0]["scores"], + results[0]["labels"], + ) + boxes = boxes.to("cpu") + labels = labels.to("cpu") + scores = scores.to("cpu") + + # Initialize dictionaries to store most confident bounding boxes and scores per label + target_boxes = {} + target_scores = {} + + for box, score, label in zip(boxes, scores, labels): + box = [round(i, 2) for i in box.tolist()] + if score >= self.score_threshold: + # If the current score is higher than the stored score for this label, update the target box and score + if ( + label.item() not in target_scores + or score > target_scores[label.item()] + ): + target_scores[label.item()] = score.item() + target_boxes[label.item()] = box + + # Format the output + result = [] + for label, box in target_boxes.items(): + x1 = int(box[0]) + y1 = int(box[1]) + x2 = int(box[2]) + y2 = int(box[3]) + + result.append( + [self.labels[0][label], target_scores[label], [x1, y1, x2, y2]] + ) + + return result + + def create_img_with_bounding_box(self, img, results): + """ + Returns an image with all bounding boxes avove the threshold overlaid + """ + + results = self.get_most_confident_bounding_box_per_label(results) + font = cv2.FONT_HERSHEY_SIMPLEX + + for label, score, box in results: + img = cv2.rectangle(img, box[:2], box[2:], (255, 0, 0), 5) + if box[3] + 25 > 768: + y = box[3] - 10 + else: + y = box[3] + 25 + img = cv2.putText( + img, label, (box[0], y), font, 1, (255, 0, 0), 2, cv2.LINE_AA + ) + + return img + + def update_label(self, labels): + self.labels = labels + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--file", + type=str, + default="/home/akshara/spot/spot_rl_experiments/spot_rl/grasp_visualizations/1650841878.2699108.png", + ) + parser.add_argument("--score_threshold", type=float, default=0.1) + parser.add_argument("--show_img", type=bool, default=True) + parser.add_argument( + "--labels", + type=list, + default=[ + [ + "lion plush", + "penguin plush", + "teddy bear", + "bear plush", + "caterpilar plush", + "ball plush", + "rubiks cube", + ] + ], + ) + args = parser.parse_args() + + file = args.file + img = cv2.imread(file) + + V = OwlVit(args.labels, args.score_threshold, args.show_img) + results = V.run_inference(img) + # Keep the window open for 10 seconds + time.sleep(10) diff --git 
a/semantic_exploration/models/semantic_map.py b/semantic_exploration/models/semantic_map.py new file mode 100644 index 0000000..1b866d0 --- /dev/null +++ b/semantic_exploration/models/semantic_map.py @@ -0,0 +1,240 @@ +import numpy as np +import third_party.semantic_exploration.envs.utils.depth_utils as du +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class ChannelPool(nn.MaxPool1d): + def forward(self, x): + n, c, w, h = x.size() + x = x.view(n, c, w * h).permute(0, 2, 1) + x = x.contiguous() + pooled = F.max_pool1d(x, c, 1) + _, _, c = pooled.size() + pooled = pooled.permute(0, 2, 1) + return pooled.view(n, c, w, h) + + +class Semantic_Mapping(nn.Module): + + """ + Semantic_Mapping + """ + + def __init__(self, config): + super(Semantic_Mapping, self).__init__() + + self.device = config.DEVICE + self.screen_h = config.FRAME_HEIGHT + self.screen_w = config.FRAME_WIDTH + self.resolution = config.MAP_RESOLUTION + self.z_resolution = config.MAP_RESOLUTION + self.map_size_cm = config.MAP_SIZE_CM // config.GLOBAL_DOWNSCALING + self.n_channels = 3 + self.vision_range = config.VISION_RANGE + self.dropout = 0.5 + self.fov = config.HFOV + self.du_scale = config.DU_SCALE + self.cat_pred_threshold = config.CAT_PRED_THRESHOLD + self.exp_pred_threshold = config.EXP_PRED_THRESHOLD + self.map_pred_threshold = config.MAP_PRED_THRESHOLD + self.num_sem_categories = config.NUM_SEM_CATEGORIES + + self.max_height = int(180 / self.z_resolution) + self.min_height = int(-40 / self.z_resolution) + self.agent_height = config.CAMERA_HEIGHT * 100.0 + self.shift_loc = [self.vision_range * self.resolution // 2, 0, np.pi / 2.0] + self.camera_matrix = du.get_camera_matrix( + self.screen_w, self.screen_h, self.fov + ) + + self.pool = ChannelPool(1) + + vr = self.vision_range + + self.init_grid = ( + torch.zeros( + config.NUM_PROCESSES, + 1 + self.num_sem_categories, + vr, + vr, + self.max_height - self.min_height, + ) + .float() + .to(self.device) + ) + self.feat = ( + torch.ones( + config.NUM_PROCESSES, + 1 + self.num_sem_categories, + self.screen_h // self.du_scale * self.screen_w // self.du_scale, + ) + .float() + .to(self.device) + ) + + def forward(self, obs, pose_obs, maps_last, poses_last): + bs, c, h, w = obs.size() + depth = obs[:, 3, :, :] + point_cloud_t = du.get_point_cloud_from_z_t( + depth, self.camera_matrix, self.device, scale=self.du_scale + ) + + agent_view_t = du.transform_camera_view_t( + point_cloud_t, self.agent_height, 0, self.device + ) + + agent_view_centered_t = du.transform_pose_t( + agent_view_t, self.shift_loc, self.device + ) + + max_h = self.max_height + min_h = self.min_height + xy_resolution = self.resolution + z_resolution = self.z_resolution + vision_range = self.vision_range + XYZ_cm_std = agent_view_centered_t.float() + XYZ_cm_std[..., :2] = XYZ_cm_std[..., :2] / xy_resolution + XYZ_cm_std[..., :2] = ( + (XYZ_cm_std[..., :2] - vision_range // 2.0) / vision_range * 2.0 + ) + XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution + XYZ_cm_std[..., 2] = ( + (XYZ_cm_std[..., 2] - (max_h + min_h) // 2.0) / (max_h - min_h) * 2.0 + ) + self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)(obs[:, 4:, :, :]).view( + bs, c - 4, h // self.du_scale * w // self.du_scale + ) + + XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2) + XYZ_cm_std = XYZ_cm_std.view( + XYZ_cm_std.shape[0], + XYZ_cm_std.shape[1], + XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3], + ) + + voxels = du.splat_feat_nd( + self.init_grid * 0.0, self.feat, XYZ_cm_std + ).transpose(2, 3) + + min_z = int(25 / 
z_resolution - min_h) + max_z = int((self.agent_height + 1) / z_resolution - min_h) + + agent_height_proj = voxels[..., min_z:max_z].sum(4) + all_height_proj = voxels.sum(4) + + fp_map_pred = agent_height_proj[:, 0:1, :, :] + fp_exp_pred = all_height_proj[:, 0:1, :, :] + fp_map_pred = fp_map_pred / self.map_pred_threshold + fp_exp_pred = fp_exp_pred / self.exp_pred_threshold + fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0) + fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0) + + pose_pred = poses_last + + agent_view = torch.zeros( + bs, + c, + self.map_size_cm // self.resolution, + self.map_size_cm // self.resolution, + ).to(self.device) + + x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2 + x2 = x1 + self.vision_range + y1 = self.map_size_cm // (self.resolution * 2) + y2 = y1 + self.vision_range + + agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred + agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred + agent_view[:, 4:, y1:y2, x1:x2] = torch.clamp( + agent_height_proj[:, 1:, :, :] / self.cat_pred_threshold, min=0.0, max=1.0 + ) + + corrected_pose = pose_obs + + def get_new_pose_batch(pose, rel_pose_change): + + pose[:, 1] += rel_pose_change[:, 0] * torch.sin( + pose[:, 2] / 57.29577951308232 + ) + rel_pose_change[:, 1] * torch.cos(pose[:, 2] / 57.29577951308232) + pose[:, 0] += rel_pose_change[:, 0] * torch.cos( + pose[:, 2] / 57.29577951308232 + ) - rel_pose_change[:, 1] * torch.sin(pose[:, 2] / 57.29577951308232) + pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232 + + pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0 + pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0 + + return pose + + current_poses = get_new_pose_batch(poses_last, corrected_pose) + st_pose = current_poses.clone().detach() + + st_pose[:, :2] = -( + st_pose[:, :2] * 100.0 / self.resolution + - self.map_size_cm // (self.resolution * 2) + ) / (self.map_size_cm // (self.resolution * 2)) + st_pose[:, 2] = 90.0 - (st_pose[:, 2]) + + rot_mat, trans_mat = self.get_grid(st_pose, agent_view.size(), self.device) + + rotated = F.grid_sample(agent_view, rot_mat, align_corners=True) + translated = F.grid_sample(rotated, trans_mat, align_corners=True) + + # Remove people in the last map if found new people + if translated[:, 19, :, :].sum() > 0.99: + maps_last[:, 19, :, :] = 0 + + maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) + + map_pred, _ = torch.max(maps2, 1) + + if np.sum(np.array(map_pred)[0, 1, :, :]) == 0: + import pdb + + pdb.set_trace() + + return fp_map_pred, map_pred, pose_pred, current_poses + + @staticmethod + def get_grid(pose, grid_size, device): + """ + Input: + `pose` FloatTensor(bs, 3) + `grid_size` 4-tuple (bs, _, grid_h, grid_w) + `device` torch.device (cpu or gpu) + Output: + `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) + `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) + + """ + pose = pose.float() + x = pose[:, 0] + y = pose[:, 1] + t = pose[:, 2] + + t = t * np.pi / 180.0 + cos_t = t.cos() + sin_t = t.sin() + + theta11 = torch.stack( + [cos_t, -sin_t, torch.zeros(cos_t.shape).float().to(device)], 1 + ) + theta12 = torch.stack( + [sin_t, cos_t, torch.zeros(cos_t.shape).float().to(device)], 1 + ) + theta1 = torch.stack([theta11, theta12], 1) + + theta21 = torch.stack( + [torch.ones(x.shape).to(device), -torch.zeros(x.shape).to(device), x], 1 + ) + theta22 = torch.stack( + [torch.zeros(x.shape).to(device), torch.ones(x.shape).to(device), y], 1 + ) + theta2 = torch.stack([theta21, theta22], 1) + + rot_grid = F.affine_grid(theta1, 
torch.Size(grid_size)) + trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) + + return rot_grid, trans_grid diff --git a/semantic_exploration/models/sentence_similarity.py b/semantic_exploration/models/sentence_similarity.py new file mode 100644 index 0000000..f574a63 --- /dev/null +++ b/semantic_exploration/models/sentence_similarity.py @@ -0,0 +1,73 @@ +import torch +import torch.nn.functional as F +from transformers import AutoModel, AutoTokenizer + + +class SentenceSimilarity: + def __init__(self): + # Load model from HuggingFace Hub + self.tokenizer = AutoTokenizer.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2" + ) + self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") + + def mean_pooling(self, model_output, attention_mask): + # Mean Pooling - Take attention mask into account for correct averaging + + token_embeddings = model_output[ + 0 + ] # First element of model_output contains all token embeddings + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( + input_mask_expanded.sum(1), min=1e-9 + ) + + def get_similarity_two_sentences(self, a, b): + sentences = [a, b] + + # Tokenize sentences + encoded_input = self.tokenizer( + sentences, padding=True, truncation=True, return_tensors="pt" + ) + + # Compute token embeddings + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Perform pooling + sentence_embeddings = self.mean_pooling( + model_output, encoded_input["attention_mask"] + ) + + # Normalize embeddings + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + # compute cosine similarity between embeddings + cosine_scores = sentence_embeddings[0] @ sentence_embeddings[1].T + return cosine_scores + + def get_most_similar_in_list(self, query_word, list): + sentences = [query_word] + [word.replace("_", " ") for word in list] + encoded_input = self.tokenizer( + sentences, padding=True, truncation=True, return_tensors="pt" + ) + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Perform pooling + sentence_embeddings = self.mean_pooling( + model_output, encoded_input["attention_mask"] + ) + + # Normalize embeddings + sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) + + # compute cosine similarity between embeddings + cosine_scores = sentence_embeddings[0] @ sentence_embeddings[1:].T + print( + f"word queried : {query_word} | word list : {list} | cosine scores : {cosine_scores}" + ) + + return list[torch.argmax(cosine_scores).item()] diff --git a/test.py b/test.py deleted file mode 100644 index 7a81a77..0000000 --- a/test.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import torch -import numpy as np - -from envs import make_vec_envs -from arguments import get_args - -os.environ["OMP_NUM_THREADS"] = "1" - -args = get_args() - -np.random.seed(args.seed) -torch.manual_seed(args.seed) - -if args.cuda: - torch.cuda.manual_seed(args.seed) - - -def main(): - num_episodes = int(args.num_eval_episodes) - args.device = torch.device("cuda:0" if args.cuda else "cpu") - - torch.set_num_threads(1) - envs = make_vec_envs(args) - obs, infos = envs.reset() - - for ep_num in range(num_episodes): - for step in range(args.max_episode_length): - action = torch.randint(0, 3, (args.num_processes,)) - obs, rew, done, infos = envs.step(action) - - if done: - break - - print("Test successfully completed") - - -if __name__ == "__main__": - main() diff --git 
a/utils/distributions.py b/utils/distributions.py deleted file mode 100644 index cd025eb..0000000 --- a/utils/distributions.py +++ /dev/null @@ -1,61 +0,0 @@ -# The following code is largely borrowed from: -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py - -import torch -import torch.nn as nn - -from utils.model import AddBias - -""" -Modify standard PyTorch distributions so they are compatible with this code. -""" - -FixedCategorical = torch.distributions.Categorical - -old_sample = FixedCategorical.sample -FixedCategorical.sample = lambda self: old_sample(self) - -log_prob_cat = FixedCategorical.log_prob -FixedCategorical.log_probs = lambda self, actions: \ - log_prob_cat(self, actions.squeeze(-1)) -FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) - -FixedNormal = torch.distributions.Normal -log_prob_normal = FixedNormal.log_prob -FixedNormal.log_probs = lambda self, actions: \ - log_prob_normal(self, actions).sum(-1, keepdim=False) - -entropy = FixedNormal.entropy -FixedNormal.entropy = lambda self: entropy(self).sum(-1) - -FixedNormal.mode = lambda self: self.mean - - -class Categorical(nn.Module): - - def __init__(self, num_inputs, num_outputs): - super(Categorical, self).__init__() - self.linear = nn.Linear(num_inputs, num_outputs) - - def forward(self, x): - x = self.linear(x) - return FixedCategorical(logits=x) - - -class DiagGaussian(nn.Module): - - def __init__(self, num_inputs, num_outputs): - super(DiagGaussian, self).__init__() - - self.fc_mean = nn.Linear(num_inputs, num_outputs) - self.logstd = AddBias(torch.zeros(num_outputs)) - - def forward(self, x): - action_mean = self.fc_mean(x) - - zeros = torch.zeros(action_mean.size()) - if x.is_cuda: - zeros = zeros.cuda() - - action_logstd = self.logstd(zeros) - return FixedNormal(action_mean, action_logstd.exp()) diff --git a/utils/model.py b/utils/model.py deleted file mode 100644 index e55b045..0000000 --- a/utils/model.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - - -def get_grid(pose, grid_size, device): - """ - Input: - `pose` FloatTensor(bs, 3) - `grid_size` 4-tuple (bs, _, grid_h, grid_w) - `device` torch.device (cpu or gpu) - Output: - `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) - `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) - - """ - pose = pose.float() - x = pose[:, 0] - y = pose[:, 1] - t = pose[:, 2] - - bs = x.size(0) - t = t * np.pi / 180. 
- cos_t = t.cos() - sin_t = t.sin() - - theta11 = torch.stack([cos_t, -sin_t, - torch.zeros(cos_t.shape).float().to(device)], 1) - theta12 = torch.stack([sin_t, cos_t, - torch.zeros(cos_t.shape).float().to(device)], 1) - theta1 = torch.stack([theta11, theta12], 1) - - theta21 = torch.stack([torch.ones(x.shape).to(device), - -torch.zeros(x.shape).to(device), x], 1) - theta22 = torch.stack([torch.zeros(x.shape).to(device), - torch.ones(x.shape).to(device), y], 1) - theta2 = torch.stack([theta21, theta22], 1) - - rot_grid = F.affine_grid(theta1, torch.Size(grid_size)) - trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) - - return rot_grid, trans_grid - - -class ChannelPool(nn.MaxPool1d): - def forward(self, x): - n, c, w, h = x.size() - x = x.view(n, c, w * h).permute(0, 2, 1) - x = x.contiguous() - pooled = F.max_pool1d(x, c, 1) - _, _, c = pooled.size() - pooled = pooled.permute(0, 2, 1) - return pooled.view(n, c, w, h) - - -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/utils.py#L32 -class AddBias(nn.Module): - def __init__(self, bias): - super(AddBias, self).__init__() - self._bias = nn.Parameter(bias.unsqueeze(1)) - - def forward(self, x): - if x.dim() == 2: - bias = self._bias.t().view(1, -1) - else: - bias = self._bias.t().view(1, -1, 1, 1) - - return x + bias - - -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L10 -class Flatten(nn.Module): - def forward(self, x): - return x.view(x.size(0), -1) - - -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L82 -class NNBase(nn.Module): - - def __init__(self, recurrent, recurrent_input_size, hidden_size): - - super(NNBase, self).__init__() - self._hidden_size = hidden_size - self._recurrent = recurrent - - if recurrent: - self.gru = nn.GRUCell(recurrent_input_size, hidden_size) - nn.init.orthogonal_(self.gru.weight_ih.data) - nn.init.orthogonal_(self.gru.weight_hh.data) - self.gru.bias_ih.data.fill_(0) - self.gru.bias_hh.data.fill_(0) - - @property - def is_recurrent(self): - return self._recurrent - - @property - def rec_state_size(self): - if self._recurrent: - return self._hidden_size - return 1 - - @property - def output_size(self): - return self._hidden_size - - def _forward_gru(self, x, hxs, masks): - if x.size(0) == hxs.size(0): - x = hxs = self.gru(x, hxs * masks[:, None]) - else: - # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) - N = hxs.size(0) - T = int(x.size(0) / N) - - # unflatten - x = x.view(T, N, x.size(1)) - - # Same deal with masks - masks = masks.view(T, N, 1) - - outputs = [] - for i in range(T): - hx = hxs = self.gru(x[i], hxs * masks[i]) - outputs.append(hx) - - # x is a (T, N, -1) tensor - x = torch.stack(outputs, dim=0) - # flatten - x = x.view(T * N, -1) - - return x, hxs diff --git a/utils/optimization.py b/utils/optimization.py deleted file mode 100644 index 7f4050b..0000000 --- a/utils/optimization.py +++ /dev/null @@ -1,59 +0,0 @@ -import inspect -import re - -from torch import optim - - -def get_optimizer(parameters, s): - """ - Parse optimizer parameters. 
- Input should be of the form: - - "sgd,lr=0.01" - - "adagrad,lr=0.1,lr_decay=0.05" - """ - if "," in s: - method = s[:s.find(',')] - optim_params = {} - for x in s[s.find(',') + 1:].split(','): - split = x.split('=') - assert len(split) == 2 - assert re.match( - r"^[+-]?(\d+(\.\d*)?|\.\d+)$", - split[1]) is not None - optim_params[split[0]] = float(split[1]) - else: - method = s - optim_params = {} - - if method == 'adadelta': - optim_fn = optim.Adadelta - elif method == 'adagrad': - optim_fn = optim.Adagrad - elif method == 'adam': - optim_fn = optim.Adam - optim_params['betas'] = (optim_params.get('beta1', 0.5), - optim_params.get('beta2', 0.999)) - optim_params.pop('beta1', None) - optim_params.pop('beta2', None) - elif method == 'adamax': - optim_fn = optim.Adamax - elif method == 'asgd': - optim_fn = optim.ASGD - elif method == 'rmsprop': - optim_fn = optim.RMSprop - elif method == 'rprop': - optim_fn = optim.Rprop - elif method == 'sgd': - optim_fn = optim.SGD - assert 'lr' in optim_params - else: - raise Exception('Unknown optimization method: "%s"' % method) - - # check that we give good parameters to the optimizer - expected_args = inspect.getargspec(optim_fn.__init__)[0] - assert expected_args[:2] == ['self', 'params'] - if not all(k in expected_args[2:] for k in optim_params.keys()): - raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( - str(expected_args[2:]), str(optim_params.keys()))) - - return optim_fn(parameters, **optim_params) diff --git a/utils/storage.py b/utils/storage.py deleted file mode 100644 index e71cac3..0000000 --- a/utils/storage.py +++ /dev/null @@ -1,203 +0,0 @@ -# The following code is largely borrowed from: -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/storage.py - -from collections import namedtuple - -import numpy as np -import torch -from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler - - -def _flatten_helper(T, N, _tensor): - return _tensor.view(T * N, *_tensor.size()[2:]) - - -class RolloutStorage(object): - - def __init__(self, num_steps, num_processes, obs_shape, action_space, - rec_state_size): - - if action_space.__class__.__name__ == 'Discrete': - self.n_actions = 1 - action_type = torch.long - else: - self.n_actions = action_space.shape[0] - action_type = torch.float32 - - self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) - self.rec_states = torch.zeros(num_steps + 1, num_processes, - rec_state_size) - self.rewards = torch.zeros(num_steps, num_processes) - self.value_preds = torch.zeros(num_steps + 1, num_processes) - self.returns = torch.zeros(num_steps + 1, num_processes) - self.action_log_probs = torch.zeros(num_steps, num_processes) - self.actions = torch.zeros((num_steps, num_processes, self.n_actions), - dtype=action_type) - self.masks = torch.ones(num_steps + 1, num_processes) - - self.num_steps = num_steps - self.step = 0 - self.has_extras = False - self.extras_size = None - - def to(self, device): - self.obs = self.obs.to(device) - self.rec_states = self.rec_states.to(device) - self.rewards = self.rewards.to(device) - self.value_preds = self.value_preds.to(device) - self.returns = self.returns.to(device) - self.action_log_probs = self.action_log_probs.to(device) - self.actions = self.actions.to(device) - self.masks = self.masks.to(device) - if self.has_extras: - self.extras = self.extras.to(device) - return self - - def insert(self, obs, rec_states, actions, action_log_probs, value_preds, - rewards, masks): - self.obs[self.step + 1].copy_(obs) 
- self.rec_states[self.step + 1].copy_(rec_states) - self.actions[self.step].copy_(actions.view(-1, self.n_actions)) - self.action_log_probs[self.step].copy_(action_log_probs) - self.value_preds[self.step].copy_(value_preds) - self.rewards[self.step].copy_(rewards) - self.masks[self.step + 1].copy_(masks) - - self.step = (self.step + 1) % self.num_steps - - def after_update(self): - self.obs[0].copy_(self.obs[-1]) - self.rec_states[0].copy_(self.rec_states[-1]) - self.masks[0].copy_(self.masks[-1]) - if self.has_extras: - self.extras[0].copy_(self.extras[-1]) - - def compute_returns(self, next_value, use_gae, gamma, tau): - if use_gae: - self.value_preds[-1] = next_value - gae = 0 - for step in reversed(range(self.rewards.size(0))): - delta = self.rewards[step] + gamma \ - * self.value_preds[step + 1] * self.masks[step + 1] \ - - self.value_preds[step] - gae = delta + gamma * tau * self.masks[step + 1] * gae - self.returns[step] = gae + self.value_preds[step] - else: - self.returns[-1] = next_value - for step in reversed(range(self.rewards.size(0))): - self.returns[step] = self.returns[step + 1] * gamma \ - * self.masks[step + 1] + self.rewards[step] - - def feed_forward_generator(self, advantages, num_mini_batch): - - num_steps, num_processes = self.rewards.size()[0:2] - batch_size = num_processes * num_steps - mini_batch_size = batch_size // num_mini_batch - assert batch_size >= num_mini_batch, ( - "PPO requires the number of processes ({}) " - "* number of steps ({}) = {} " - "to be greater than or equal to " - "the number of PPO mini batches ({})." - "".format(num_processes, num_steps, num_processes * num_steps, - num_mini_batch)) - - sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), - mini_batch_size, drop_last=False) - - for indices in sampler: - yield { - 'obs': self.obs[:-1].view(-1, *self.obs.size()[2:])[indices], - 'rec_states': self.rec_states[:-1].view( - -1, self.rec_states.size(-1))[indices], - 'actions': self.actions.view(-1, self.n_actions)[indices], - 'value_preds': self.value_preds[:-1].view(-1)[indices], - 'returns': self.returns[:-1].view(-1)[indices], - 'masks': self.masks[:-1].view(-1)[indices], - 'old_action_log_probs': self.action_log_probs.view(-1)[indices], - 'adv_targ': advantages.view(-1)[indices], - 'extras': self.extras[:-1].view( - -1, self.extras_size)[indices] - if self.has_extras else None, - } - - def recurrent_generator(self, advantages, num_mini_batch): - - num_processes = self.rewards.size(1) - assert num_processes >= num_mini_batch, ( - "PPO requires the number of processes ({}) " - "to be greater than or equal to the number of " - "PPO mini batches ({}).".format(num_processes, num_mini_batch)) - num_envs_per_batch = num_processes // num_mini_batch - perm = torch.randperm(num_processes) - T, N = self.num_steps, num_envs_per_batch - - for start_ind in range(0, num_processes, num_envs_per_batch): - - obs = [] - rec_states = [] - actions = [] - value_preds = [] - returns = [] - masks = [] - old_action_log_probs = [] - adv_targ = [] - if self.has_extras: - extras = [] - - for offset in range(num_envs_per_batch): - - ind = perm[start_ind + offset] - obs.append(self.obs[:-1, ind]) - rec_states.append(self.rec_states[0:1, ind]) - actions.append(self.actions[:, ind]) - value_preds.append(self.value_preds[:-1, ind]) - returns.append(self.returns[:-1, ind]) - masks.append(self.masks[:-1, ind]) - old_action_log_probs.append(self.action_log_probs[:, ind]) - adv_targ.append(advantages[:, ind]) - if self.has_extras: - extras.append(self.extras[:-1, 
ind]) - - # These are all tensors of size (T, N, ...) - obs = torch.stack(obs, 1) - actions = torch.stack(actions, 1) - value_preds = torch.stack(value_preds, 1) - returns = torch.stack(returns, 1) - masks = torch.stack(masks, 1) - old_action_log_probs = torch.stack(old_action_log_probs, 1) - adv_targ = torch.stack(adv_targ, 1) - if self.has_extras: - extras = torch.stack(extras, 1) - - yield { - 'obs': _flatten_helper(T, N, obs), - 'actions': _flatten_helper(T, N, actions), - 'value_preds': _flatten_helper(T, N, value_preds), - 'returns': _flatten_helper(T, N, returns), - 'masks': _flatten_helper(T, N, masks), - 'old_action_log_probs': _flatten_helper( - T, N, old_action_log_probs), - 'adv_targ': _flatten_helper(T, N, adv_targ), - 'extras': _flatten_helper( - T, N, extras) if self.has_extras else None, - 'rec_states': torch.stack(rec_states, 1).view(N, -1), - } - - -class GlobalRolloutStorage(RolloutStorage): - - def __init__(self, num_steps, num_processes, obs_shape, action_space, - rec_state_size, extras_size): - super(GlobalRolloutStorage, self).__init__( - num_steps, num_processes, obs_shape, action_space, rec_state_size) - self.extras = torch.zeros((num_steps + 1, num_processes, extras_size), - dtype=torch.long) - self.has_extras = True - self.extras_size = extras_size - - def insert(self, obs, rec_states, actions, action_log_probs, value_preds, - rewards, masks, extras): - self.extras[self.step + 1].copy_(extras) - super(GlobalRolloutStorage, self).insert( - obs, rec_states, actions, - action_log_probs, value_preds, rewards, masks)
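
Minimal usage sketch for the newly added SentenceSimilarity helper (not part of the diff; the import path and example category names are illustrative assumptions, and the first call downloads the all-MiniLM-L6-v2 weights from the HuggingFace Hub):

from semantic_exploration.models.sentence_similarity import SentenceSimilarity

matcher = SentenceSimilarity()

# Cosine similarity between two free-form phrases, returned as a 0-dim tensor.
score = matcher.get_similarity_two_sentences("couch", "sofa")
print(float(score))

# Pick the closest candidate label for a query word; underscores in the
# candidate labels are replaced with spaces before encoding, and the method
# prints the cosine scores it computed.
categories = ["potted_plant", "sofa", "tv_monitor"]
print(matcher.get_most_similar_in_list("couch", categories))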