diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 33e9f4a..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,65 +0,0 @@
-# Base image
-FROM nvidia/cudagl:10.1-devel-ubuntu16.04
-
-# Setup basic packages
-RUN apt-get update && apt-get install -y --no-install-recommends \
- build-essential \
- git \
- curl \
- vim \
- ca-certificates \
- libjpeg-dev \
- libpng-dev \
- libglfw3-dev \
- libglm-dev \
- libx11-dev \
- libomp-dev \
- libegl1-mesa-dev \
- pkg-config \
- wget \
- zip \
- htop \
- tmux \
- unzip &&\
- rm -rf /var/lib/apt/lists/*
-
-# Install conda
-RUN wget -O $HOME/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh &&\
- chmod +x ~/miniconda.sh &&\
- ~/miniconda.sh -b -p /custom/conda &&\
- rm ~/miniconda.sh &&\
- /custom/conda/bin/conda install numpy pyyaml scipy ipython mkl mkl-include &&\
- /custom/conda/bin/conda clean -ya
-ENV PATH /custom/conda/bin:$PATH
-
-# Install cmake
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.0/cmake-3.14.0-Linux-x86_64.sh
-RUN mkdir /opt/cmake
-RUN sh /cmake-3.14.0-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
-RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
-RUN cmake --version
-
-# Setup habitat-sim
-RUN git clone https://github.com/facebookresearch/habitat-sim.git
-RUN /bin/bash -c "cd habitat-sim; git checkout tags/v0.1.5; pip install -r requirements.txt; python setup.py install --headless --with-cuda"
-
-# Install challenge specific habitat-api
-RUN git clone https://github.com/facebookresearch/habitat-api.git
-RUN /bin/bash -c "cd habitat-api; git checkout tags/v0.1.5; pip install -e ."
-RUN /bin/bash -c "cd habitat-api; wget http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip; unzip habitat-test-scenes.zip"
-
-# Silence habitat-sim logs
-ENV GLOG_minloglevel=2
-ENV MAGNUM_LOG="quiet"
-
-# Install project specific packages
-RUN /bin/bash -c "apt-get update; apt-get install -y libsm6 libxext6 libxrender-dev; pip install opencv-python"
-RUN /bin/bash -c "pip install --upgrade cython numpy"
-RUN /bin/bash -c "pip install matplotlib seaborn==0.9.0 scikit-fmm==2019.1.30 scikit-image==0.15.0 imageio==2.6.0 scikit-learn==0.22.2.post1 ifcfg"
-
-# Install pytorch and torch_scatter
-RUN conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.2 -c pytorch
-RUN /bin/bash -c "pip install torch_scatter"
-
-# Install detectron2
-RUN /bin/bash -c "python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.6/index.html"
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 9f37f1a..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2020 Devendra Chaplot
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/README.md b/README.md
deleted file mode 100644
index e7a6f76..0000000
--- a/README.md
+++ /dev/null
@@ -1,155 +0,0 @@
-# Object Goal Navigation using Goal-Oriented Semantic Exploration
-This is a PyTorch implementation of the NeurIPS-20 paper:
-
-[Object Goal Navigation using Goal-Oriented Semantic Exploration](https://arxiv.org/pdf/2007.00643.pdf)
-Devendra Singh Chaplot, Dhiraj Gandhi, Abhinav Gupta, Ruslan Salakhutdinov
-Carnegie Mellon University, Facebook AI Research
-
-Winner of the [CVPR 2020 Habitat ObjectNav Challenge](https://aihabitat.org/challenge/2020/).
-
-Project Website: https://devendrachaplot.github.io/projects/semantic-exploration
-
-
-
-### Overview:
-The Goal-Oriented Semantic Exploration (SemExp) model consists of three modules: a Semantic Mapping Module, a Goal-Oriented Semantic Policy, and a deterministic Local Policy.
-The Semantic Mapping module builds a semantic map of the environment over time. The Goal-Oriented Semantic Policy selects a long-term goal on the semantic
-map so as to reach the given object goal efficiently. The deterministic Local Policy, based on analytical planners, takes low-level navigation actions to reach the long-term goal.
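-
-The sketch below is a minimal, illustrative Python loop showing how the three modules interact at each timestep; the function names, shapes, and stub logic are assumptions for illustration, not the repository's actual API.
-```
-# Hypothetical stand-ins for the three SemExp modules (illustrative only).
-import numpy as np
-
-def semantic_mapping(sem_map, rgbd_obs, pose):
-    # The real module projects first-person semantic predictions into the top-down map.
-    return sem_map
-
-def goal_oriented_semantic_policy(sem_map, goal_cat_id):
-    # The real policy selects a long-term goal location on the semantic map.
-    h, w = sem_map.shape[1:]
-    return np.random.randint(h), np.random.randint(w)
-
-def local_policy(sem_map, pose, long_term_goal):
-    # The real local policy plans a path to the goal and returns a discrete action.
-    return 1  # 0: stop, 1: move forward, 2: turn left, 3: turn right
-
-sem_map = np.zeros((20, 480, 480))        # (channels, M, M) semantic map
-pose = (12.0, 12.0, 0.0)                  # x (m), y (m), orientation (deg)
-long_term_goal = None
-for step in range(500):                   # max episode length
-    rgbd_obs = np.zeros((4, 480, 640))    # placeholder RGB-D frame
-    sem_map = semantic_mapping(sem_map, rgbd_obs, pose)
-    if step % 25 == 0:                    # new long-term goal every 25 local steps
-        long_term_goal = goal_oriented_semantic_policy(sem_map, goal_cat_id=0)
-    action = local_policy(sem_map, pose, long_term_goal)
-```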
-
-
-
-### This repository contains:
-- Train and test episode datasets for the [Object Goal Navigation](https://arxiv.org/pdf/2007.00643.pdf) task on the Gibson scenes in the Habitat Simulator.
-- The code to train and evaluate the Semantic Exploration (SemExp) model on the Object Goal Navigation task.
-- Pretrained SemExp model.
-
-## Installing Dependencies
-- We use earlier versions of [habitat-sim](https://github.com/facebookresearch/habitat-sim) and [habitat-lab](https://github.com/facebookresearch/habitat-lab) as specified below:
-
-Installing habitat-sim:
-```
-git clone https://github.com/facebookresearch/habitat-sim.git
-cd habitat-sim; git checkout tags/v0.1.5;
-pip install -r requirements.txt;
-python setup.py install --headless
-python setup.py install # (for Mac OS)
-```
-
-Installing habitat-lab:
-```
-git clone https://github.com/facebookresearch/habitat-lab.git
-cd habitat-lab; git checkout tags/v0.1.5;
-pip install -e .
-```
-Check habitat installation by running `python examples/benchmark.py` in the habitat-lab folder.
-
-- Install [pytorch](https://pytorch.org/) according to your system configuration. The code is tested on pytorch v1.6.0 and cudatoolkit v10.2. If you are using conda:
-```
-conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.2 -c pytorch #(Linux with GPU)
-conda install pytorch==1.6.0 torchvision==0.7.0 -c pytorch #(Mac OS)
-```
-
-- Install [detectron2](https://github.com/facebookresearch/detectron2/) according to your system configuration. If you are using conda:
-```
-python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.6/index.html #(Linux with GPU)
-CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' #(Mac OS)
-```
-
-### Docker and Singularity images:
-We provide experimental [Docker](https://www.docker.com/) and [Singularity](https://sylabs.io/) images with all the dependencies installed; see [Docker Instructions](./docs/DOCKER_INSTRUCTIONS.md).
-
-
-## Setup
-Clone the repository and install other requirements:
-```
-git clone https://github.com/devendrachaplot/Object-Goal-Navigation/
-cd Object-Goal-Navigation/;
-pip install -r requirements.txt
-```
-
-### Downloading scene dataset
-- Download the Gibson dataset using the instructions here: https://github.com/facebookresearch/habitat-lab#scenes-datasets (download the 11GB file `gibson_habitat_trainval.zip`)
-- Move the downloaded Gibson scene dataset to `data/scene_datasets/gibson_semantic`, or create a symlink at that path.
-
-### Downloading episode dataset
-- Download the episode dataset:
-```
-wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1tslnZAkH8m3V5nP8pbtBmaR2XEfr8Rau' -O objectnav_gibson_v1.1.zip
-```
-- Unzip the dataset into `data/datasets/objectnav/gibson/v1.1/`
-
-### Setting up datasets
-The code requires the datasets in a `data` folder in the following format (same as habitat-lab):
-```
-Object-Goal-Navigation/
- data/
- scene_datasets/
- gibson_semantic/
- Adrian.glb
- Adrian.navmesh
- ...
- datasets/
- objectnav/
- gibson/
- v1.1/
- train/
- val/
-```
-
-
-### Test setup
-To verify that the data is set up correctly, run:
-```
-python test.py --agent random -n1 --num_eval_episodes 1 --auto_gpu_config 0
-```
-
-## Usage
-
-### Training:
-For training the SemExp model on the Object Goal Navigation task:
-```
-python main.py
-```
-
-### Downloading pre-trained models
-```
-mkdir pretrained_models;
-wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=171ZA7XNu5vi3XLpuKs8DuGGZrYyuSjL0' -O pretrained_models/sem_exp.pth
-```
-
-### Evaluation:
-To evaluate the pre-trained model:
-```
-python main.py --split val --eval 1 --load pretrained_models/sem_exp.pth
-```
-
-For visualizing the agent observations and predicted semantic map, add `-v 1` as an argument to the above command.
-
-The pre-trained model should get 0.657 Success, 0.339 SPL and 1.474 DTG.
-
-For more detailed instructions, see [INSTRUCTIONS](./docs/INSTRUCTIONS.md).
-
-
-## Cite as
->Chaplot, D.S., Gandhi, D., Gupta, A. and Salakhutdinov, R., 2020. Object Goal Navigation using Goal-Oriented Semantic Exploration. In Neural Information Processing Systems (NeurIPS-20). ([PDF](https://arxiv.org/pdf/2007.00643.pdf))
-
-### Bibtex:
-```
-@inproceedings{chaplot2020object,
- title={Object Goal Navigation using Goal-Oriented Semantic Exploration},
- author={Chaplot, Devendra Singh and Gandhi, Dhiraj and
- Gupta, Abhinav and Salakhutdinov, Ruslan},
- booktitle={Neural Information Processing Systems (NeurIPS)},
- year={2020}
- }
-```
-
-## Related Projects
-- This project builds on the [Active Neural SLAM](https://devendrachaplot.github.io/projects/Neural-SLAM) paper. The code and pretrained models for the Active Neural SLAM system are available at:
-https://github.com/devendrachaplot/Neural-SLAM.
-- The Semantic Mapping module is similar to the one used in [Semantic Curiosity](https://devendrachaplot.github.io/projects/SemanticCuriosity).
-
-## Acknowledgements
-This repository uses [Habitat Lab](https://github.com/facebookresearch/habitat-lab) implementation for running the RL environment.
-The implementation of PPO is borrowed from [ikostrikov/pytorch-a2c-ppo-acktr-gail](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/).
-The Mask-RCNN implementation is based on the [detectron2](https://github.com/facebookresearch/detectron2/) repository. We would also like to thank Shubham Tulsiani and Saurabh Gupta for their help in implementing some parts of the code.
diff --git a/agents/sem_exp.py b/agents/sem_exp.py
deleted file mode 100644
index 80283fc..0000000
--- a/agents/sem_exp.py
+++ /dev/null
@@ -1,416 +0,0 @@
-import math
-import os
-import cv2
-import numpy as np
-import skimage.morphology
-from PIL import Image
-from torchvision import transforms
-
-from envs.utils.fmm_planner import FMMPlanner
-from envs.habitat.objectgoal_env import ObjectGoal_Env
-from agents.utils.semantic_prediction import SemanticPredMaskRCNN
-from constants import color_palette
-import envs.utils.pose as pu
-import agents.utils.visualization as vu
-
-
-class Sem_Exp_Env_Agent(ObjectGoal_Env):
- """The Sem_Exp environment agent class. A separate Sem_Exp_Env_Agent class
- object is used for each environment thread.
-
- """
-
- def __init__(self, args, rank, config_env, dataset):
-
- self.args = args
- super().__init__(args, rank, config_env, dataset)
-
- # initialize transform for RGB observations
- self.res = transforms.Compose(
- [transforms.ToPILImage(),
- transforms.Resize((args.frame_height, args.frame_width),
- interpolation=Image.NEAREST)])
-
- # initialize semantic segmentation prediction model
- if args.sem_gpu_id == -1:
- args.sem_gpu_id = config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID
-
- self.sem_pred = SemanticPredMaskRCNN(args)
-
- # initializations for planning:
- self.selem = skimage.morphology.disk(3)
-
- self.obs = None
- self.obs_shape = None
- self.collision_map = None
- self.visited = None
- self.visited_vis = None
- self.col_width = None
- self.curr_loc = None
- self.last_loc = None
- self.last_action = None
- self.count_forward_actions = None
-
- if args.visualize or args.print_images:
- self.legend = cv2.imread('docs/legend.png')
- self.vis_image = None
- self.rgb_vis = None
-
- def reset(self):
- args = self.args
-
- obs, info = super().reset()
- obs = self._preprocess_obs(obs)
-
- self.obs_shape = obs.shape
-
- # Episode initializations
- map_shape = (args.map_size_cm // args.map_resolution,
- args.map_size_cm // args.map_resolution)
- self.collision_map = np.zeros(map_shape)
- self.visited = np.zeros(map_shape)
- self.visited_vis = np.zeros(map_shape)
- self.col_width = 1
- self.count_forward_actions = 0
- self.curr_loc = [args.map_size_cm / 100.0 / 2.0,
- args.map_size_cm / 100.0 / 2.0, 0.]
- self.last_action = None
-
- if args.visualize or args.print_images:
- self.vis_image = vu.init_vis_image(self.goal_name, self.legend)
-
- return obs, info
-
- def plan_act_and_preprocess(self, planner_inputs):
- """Function responsible for planning, taking the action and
- preprocessing observations
-
- Args:
- planner_inputs (dict):
- dict with following keys:
- 'map_pred' (ndarray): (M, M) map prediction
- 'goal' (ndarray): (M, M) mat denoting goal locations
- 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
- and planning window (gx1, gx2, gy1, gy2)
- 'found_goal' (bool): whether the goal object is found
-
- Returns:
- obs (ndarray): preprocessed observations ((4+C) x H x W)
- reward (float): amount of reward returned after previous action
- done (bool): whether the episode has ended
- info (dict): contains timestep, pose, goal category and
- evaluation metric info
- """
-
- # plan
- if planner_inputs["wait"]:
- self.last_action = None
- self.info["sensor_pose"] = [0., 0., 0.]
- return np.zeros(self.obs.shape), 0., False, self.info
-
- # Reset reward if new long-term goal
- if planner_inputs["new_goal"]:
- self.info["g_reward"] = 0
-
- action = self._plan(planner_inputs)
-
- if self.args.visualize or self.args.print_images:
- self._visualize(planner_inputs)
-
- if action >= 0:
-
- # act
- action = {'action': action}
- obs, rew, done, info = super().step(action)
-
- # preprocess obs
- obs = self._preprocess_obs(obs)
- self.last_action = action['action']
- self.obs = obs
- self.info = info
-
- info['g_reward'] += rew
-
- return obs, rew, done, info
-
- else:
- self.last_action = None
- self.info["sensor_pose"] = [0., 0., 0.]
- return np.zeros(self.obs_shape), 0., False, self.info
-
- def _plan(self, planner_inputs):
- """Function responsible for planning
-
- Args:
- planner_inputs (dict):
- dict with following keys:
- 'map_pred' (ndarray): (M, M) map prediction
- 'goal' (ndarray): (M, M) goal locations
- 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
- and planning window (gx1, gx2, gy1, gy2)
- 'found_goal' (bool): whether the goal object is found
-
- Returns:
- action (int): action id
- """
- args = self.args
-
- self.last_loc = self.curr_loc
-
- # Get Map prediction
- map_pred = np.rint(planner_inputs['map_pred'])
- goal = planner_inputs['goal']
-
- # Get pose prediction and global policy planning window
- start_x, start_y, start_o, gx1, gx2, gy1, gy2 = \
- planner_inputs['pose_pred']
- gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
- planning_window = [gx1, gx2, gy1, gy2]
-
- # Get curr loc
- self.curr_loc = [start_x, start_y, start_o]
- r, c = start_y, start_x
- start = [int(r * 100.0 / args.map_resolution - gx1),
- int(c * 100.0 / args.map_resolution - gy1)]
- start = pu.threshold_poses(start, map_pred.shape)
-
- self.visited[gx1:gx2, gy1:gy2][start[0] - 0:start[0] + 1,
- start[1] - 0:start[1] + 1] = 1
-
- if args.visualize or args.print_images:
- # Get last loc
- last_start_x, last_start_y = self.last_loc[0], self.last_loc[1]
- r, c = last_start_y, last_start_x
- last_start = [int(r * 100.0 / args.map_resolution - gx1),
- int(c * 100.0 / args.map_resolution - gy1)]
- last_start = pu.threshold_poses(last_start, map_pred.shape)
- self.visited_vis[gx1:gx2, gy1:gy2] = \
- vu.draw_line(last_start, start,
- self.visited_vis[gx1:gx2, gy1:gy2])
-
- # Collision check
- if self.last_action == 1:
- x1, y1, t1 = self.last_loc
- x2, y2, _ = self.curr_loc
- buf = 4
- length = 2
-
- if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05:
- self.col_width += 2
- if self.col_width == 7:
- length = 4
- buf = 3
- self.col_width = min(self.col_width, 5)
- else:
- self.col_width = 1
-
- dist = pu.get_l2_distance(x1, x2, y1, y2)
- if dist < args.collision_threshold: # Collision
- width = self.col_width
- for i in range(length):
- for j in range(width):
- wx = x1 + 0.05 * \
- ((i + buf) * np.cos(np.deg2rad(t1))
- + (j - width // 2) * np.sin(np.deg2rad(t1)))
- wy = y1 + 0.05 * \
- ((i + buf) * np.sin(np.deg2rad(t1))
- - (j - width // 2) * np.cos(np.deg2rad(t1)))
- r, c = wy, wx
- r, c = int(r * 100 / args.map_resolution), \
- int(c * 100 / args.map_resolution)
- [r, c] = pu.threshold_poses([r, c],
- self.collision_map.shape)
- self.collision_map[r, c] = 1
-
- stg, stop = self._get_stg(map_pred, start, np.copy(goal),
- planning_window)
-
- # Deterministic Local Policy
- if stop and planner_inputs['found_goal'] == 1:
- action = 0 # Stop
- else:
- (stg_x, stg_y) = stg
- angle_st_goal = math.degrees(math.atan2(stg_x - start[0],
- stg_y - start[1]))
- angle_agent = (start_o) % 360.0
- if angle_agent > 180:
- angle_agent -= 360
-
- relative_angle = (angle_agent - angle_st_goal) % 360.0
- if relative_angle > 180:
- relative_angle -= 360
-
- if relative_angle > self.args.turn_angle / 2.:
- action = 3 # Right
- elif relative_angle < -self.args.turn_angle / 2.:
- action = 2 # Left
- else:
- action = 1 # Forward
-
- return action
-
- def _get_stg(self, grid, start, goal, planning_window):
- """Get short-term goal"""
-
- [gx1, gx2, gy1, gy2] = planning_window
-
- x1, y1 = 0, 0
- x2, y2 = grid.shape
-
- def add_boundary(mat, value=1):
- h, w = mat.shape
- new_mat = np.zeros((h + 2, w + 2)) + value
- new_mat[1:h + 1, 1:w + 1] = mat
- return new_mat
-
- traversible = skimage.morphology.binary_dilation(
- grid[x1:x2, y1:y2],
- self.selem) != True
- traversible[self.collision_map[gx1:gx2, gy1:gy2]
- [x1:x2, y1:y2] == 1] = 0
- traversible[self.visited[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 1
-
- traversible[int(start[0] - x1) - 1:int(start[0] - x1) + 2,
- int(start[1] - y1) - 1:int(start[1] - y1) + 2] = 1
-
- traversible = add_boundary(traversible)
- goal = add_boundary(goal, value=0)
-
- planner = FMMPlanner(traversible)
- selem = skimage.morphology.disk(10)
- goal = skimage.morphology.binary_dilation(
- goal, selem) != True
- goal = 1 - goal * 1.
- planner.set_multi_goal(goal)
-
- state = [start[0] - x1 + 1, start[1] - y1 + 1]
- stg_x, stg_y, _, stop = planner.get_short_term_goal(state)
-
- stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1
-
- return (stg_x, stg_y), stop
-
- def _preprocess_obs(self, obs, use_seg=True):
- args = self.args
- obs = obs.transpose(1, 2, 0)
- rgb = obs[:, :, :3]
- depth = obs[:, :, 3:4]
-
- sem_seg_pred = self._get_sem_pred(
- rgb.astype(np.uint8), use_seg=use_seg)
- depth = self._preprocess_depth(depth, args.min_depth, args.max_depth)
-
- ds = args.env_frame_width // args.frame_width # Downscaling factor
- if ds != 1:
- rgb = np.asarray(self.res(rgb.astype(np.uint8)))
- depth = depth[ds // 2::ds, ds // 2::ds]
- sem_seg_pred = sem_seg_pred[ds // 2::ds, ds // 2::ds]
-
- depth = np.expand_dims(depth, axis=2)
- state = np.concatenate((rgb, depth, sem_seg_pred),
- axis=2).transpose(2, 0, 1)
-
- return state
-
- def _preprocess_depth(self, depth, min_d, max_d):
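- # Fill invalid (zero) and saturated (> 0.99) depth readings, then rescale the normalized depth to centimeters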
- depth = depth[:, :, 0] * 1
-
- for i in range(depth.shape[1]):
- depth[:, i][depth[:, i] == 0.] = depth[:, i].max()
-
- mask2 = depth > 0.99
- depth[mask2] = 0.
-
- mask1 = depth == 0
- depth[mask1] = 100.0
- depth = min_d * 100.0 + depth * max_d * 100.0
- return depth
-
- def _get_sem_pred(self, rgb, use_seg=True):
- if use_seg:
- semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb)
- semantic_pred = semantic_pred.astype(np.float32)
- else:
- semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16))
- self.rgb_vis = rgb[:, :, ::-1]
- return semantic_pred
-
- def _visualize(self, inputs):
- args = self.args
- dump_dir = "{}/dump/{}/".format(args.dump_location,
- args.exp_name)
- ep_dir = '{}/episodes/thread_{}/eps_{}/'.format(
- dump_dir, self.rank, self.episode_no)
- if not os.path.exists(ep_dir):
- os.makedirs(ep_dir)
-
- map_pred = inputs['map_pred']
- exp_pred = inputs['exp_pred']
- start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs['pose_pred']
-
- goal = inputs['goal']
- sem_map = inputs['sem_map_pred']
-
- gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
-
- sem_map += 5
-
- no_cat_mask = sem_map == 20
- map_mask = np.rint(map_pred) == 1
- exp_mask = np.rint(exp_pred) == 1
- vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1
-
- sem_map[no_cat_mask] = 0
- m1 = np.logical_and(no_cat_mask, exp_mask)
- sem_map[m1] = 2
-
- m2 = np.logical_and(no_cat_mask, map_mask)
- sem_map[m2] = 1
-
- sem_map[vis_mask] = 3
-
- selem = skimage.morphology.disk(4)
- goal_mat = 1 - skimage.morphology.binary_dilation(
- goal, selem) != True
-
- goal_mask = goal_mat == 1
- sem_map[goal_mask] = 4
-
- color_pal = [int(x * 255.) for x in color_palette]
- sem_map_vis = Image.new("P", (sem_map.shape[1],
- sem_map.shape[0]))
- sem_map_vis.putpalette(color_pal)
- sem_map_vis.putdata(sem_map.flatten().astype(np.uint8))
- sem_map_vis = sem_map_vis.convert("RGB")
- sem_map_vis = np.flipud(sem_map_vis)
-
- sem_map_vis = sem_map_vis[:, :, [2, 1, 0]]
- sem_map_vis = cv2.resize(sem_map_vis, (480, 480),
- interpolation=cv2.INTER_NEAREST)
- self.vis_image[50:530, 15:655] = self.rgb_vis
- self.vis_image[50:530, 670:1150] = sem_map_vis
-
- pos = (
- (start_x * 100. / args.map_resolution - gy1)
- * 480 / map_pred.shape[0],
- (map_pred.shape[1] - start_y * 100. / args.map_resolution + gx1)
- * 480 / map_pred.shape[1],
- np.deg2rad(-start_o)
- )
-
- agent_arrow = vu.get_contour_points(pos, origin=(670, 50))
- color = (int(color_palette[11] * 255),
- int(color_palette[10] * 255),
- int(color_palette[9] * 255))
- cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1)
-
- if args.visualize:
- # Displaying the image
- cv2.imshow("Thread {}".format(self.rank), self.vis_image)
- cv2.waitKey(1)
-
- if args.print_images:
- fn = '{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png'.format(
- dump_dir, self.rank, self.episode_no,
- self.rank, self.episode_no, self.timestep)
- cv2.imwrite(fn, self.vis_image)
diff --git a/algo/__init__.py b/algo/__init__.py
deleted file mode 100644
index 91ac6f5..0000000
--- a/algo/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .ppo import PPO
diff --git a/algo/ppo.py b/algo/ppo.py
deleted file mode 100644
index e2ea796..0000000
--- a/algo/ppo.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# The following code is largely borrowed from:
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/ppo.py
-
-import torch
-import torch.nn as nn
-import torch.optim as optim
-
-
-class PPO():
-
- def __init__(
- self,
- actor_critic,
- clip_param,
- ppo_epoch,
- num_mini_batch,
- value_loss_coef,
- entropy_coef,
- lr=None,
- eps=None,
- max_grad_norm=None,
- use_clipped_value_loss=True):
-
- self.actor_critic = actor_critic
-
- self.clip_param = clip_param
- self.ppo_epoch = ppo_epoch
- self.num_mini_batch = num_mini_batch
-
- self.value_loss_coef = value_loss_coef
- self.entropy_coef = entropy_coef
-
- self.max_grad_norm = max_grad_norm
- self.use_clipped_value_loss = use_clipped_value_loss
-
- self.optimizer = optim.Adam(filter(lambda p: p.requires_grad,
- actor_critic.parameters()),
- lr=lr, eps=eps)
-
- def update(self, rollouts):
- advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
- advantages = (advantages - advantages.mean()) / (
- advantages.std() + 1e-5)
-
- value_loss_epoch = 0
- action_loss_epoch = 0
- dist_entropy_epoch = 0
-
- for _ in range(self.ppo_epoch):
-
- if self.actor_critic.is_recurrent:
- data_generator = rollouts.recurrent_generator(
- advantages, self.num_mini_batch)
- else:
- data_generator = rollouts.feed_forward_generator(
- advantages, self.num_mini_batch)
-
- for sample in data_generator:
-
- value_preds = sample['value_preds']
- returns = sample['returns']
- adv_targ = sample['adv_targ']
-
- # Reshape to do in a single forward pass for all steps
- values, action_log_probs, dist_entropy, _ = \
- self.actor_critic.evaluate_actions(
- sample['obs'], sample['rec_states'],
- sample['masks'], sample['actions'],
- extras=sample['extras']
- )
-
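- # Importance sampling ratio between the updated and old policies (PPO clipped objective)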
- ratio = torch.exp(action_log_probs -
- sample['old_action_log_probs'])
- surr1 = ratio * adv_targ
- surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
- 1.0 + self.clip_param) * adv_targ
- action_loss = -torch.min(surr1, surr2).mean()
-
- if self.use_clipped_value_loss:
- value_pred_clipped = value_preds + \
- (values - value_preds).clamp(
- -self.clip_param, self.clip_param)
- value_losses = (values - returns).pow(2)
- value_losses_clipped = (value_pred_clipped
- - returns).pow(2)
- value_loss = .5 * torch.max(value_losses,
- value_losses_clipped).mean()
- else:
- value_loss = 0.5 * (returns - values).pow(2).mean()
-
- self.optimizer.zero_grad()
- (value_loss * self.value_loss_coef + action_loss -
- dist_entropy * self.entropy_coef).backward()
- nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
- self.max_grad_norm)
- self.optimizer.step()
-
- value_loss_epoch += value_loss.item()
- action_loss_epoch += action_loss.item()
- dist_entropy_epoch += dist_entropy.item()
-
- num_updates = self.ppo_epoch * self.num_mini_batch
-
- value_loss_epoch /= num_updates
- action_loss_epoch /= num_updates
- dist_entropy_epoch /= num_updates
-
- return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
diff --git a/arguments.py b/arguments.py
deleted file mode 100644
index baafb20..0000000
--- a/arguments.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import argparse
-import torch
-
-
-def get_args():
- parser = argparse.ArgumentParser(
- description='Goal-Oriented-Semantic-Exploration')
-
- # General Arguments
- parser.add_argument('--seed', type=int, default=1,
- help='random seed (default: 1)')
- parser.add_argument('--auto_gpu_config', type=int, default=1)
- parser.add_argument('--total_num_scenes', type=str, default="auto")
- parser.add_argument('-n', '--num_processes', type=int, default=5,
- help="""how many training processes to use (default:5)
- Overridden when auto_gpu_config=1
- and training on gpus""")
- parser.add_argument('--num_processes_per_gpu', type=int, default=6)
- parser.add_argument('--num_processes_on_first_gpu', type=int, default=1)
- parser.add_argument('--eval', type=int, default=0,
- help='0: Train, 1: Evaluate (default: 0)')
- parser.add_argument('--num_training_frames', type=int, default=10000000,
- help='total number of training frames')
- parser.add_argument('--num_eval_episodes', type=int, default=200,
- help="number of test episodes per scene")
- parser.add_argument('--num_train_episodes', type=int, default=10000,
- help="""number of train episodes per scene
- before loading the next scene""")
- parser.add_argument('--no_cuda', action='store_true', default=False,
- help='disables CUDA training')
- parser.add_argument("--sim_gpu_id", type=int, default=0,
- help="gpu id on which scenes are loaded")
- parser.add_argument("--sem_gpu_id", type=int, default=-1,
- help="""gpu id for semantic model,
- -1: same as sim gpu, -2: cpu""")
-
- # Logging, loading models, visualization
- parser.add_argument('--log_interval', type=int, default=10,
- help="""log interval, one log per n updates
- (default: 10) """)
- parser.add_argument('--save_interval', type=int, default=1,
- help="""save interval""")
- parser.add_argument('-d', '--dump_location', type=str, default="./tmp/",
- help='path to dump models and log (default: ./tmp/)')
- parser.add_argument('--exp_name', type=str, default="exp1",
- help='experiment name (default: exp1)')
- parser.add_argument('--save_periodic', type=int, default=500000,
- help='Model save frequency in number of updates')
- parser.add_argument('--load', type=str, default="0",
- help="""model path to load,
- 0 to not reload (default: 0)""")
- parser.add_argument('-v', '--visualize', type=int, default=0,
- help="""1: Render the observation and
- the predicted semantic map,
- 2: Render the observation with semantic
- predictions and the predicted semantic map
- (default: 0)""")
- parser.add_argument('--print_images', type=int, default=0,
- help='1: save visualization as images')
-
- # Environment, dataset and episode specifications
- parser.add_argument('-efw', '--env_frame_width', type=int, default=640,
- help='Frame width (default:640)')
- parser.add_argument('-efh', '--env_frame_height', type=int, default=480,
- help='Frame height (default:480)')
- parser.add_argument('-fw', '--frame_width', type=int, default=160,
- help='Frame width (default:160)')
- parser.add_argument('-fh', '--frame_height', type=int, default=120,
- help='Frame height (default:120)')
- parser.add_argument('-el', '--max_episode_length', type=int, default=500,
- help="""Maximum episode length""")
- parser.add_argument("--task_config", type=str,
- default="tasks/objectnav_gibson.yaml",
- help="path to config yaml containing task information")
- parser.add_argument("--split", type=str, default="train",
- help="dataset split (train | val | val_mini) ")
- parser.add_argument('--camera_height', type=float, default=0.88,
- help="agent camera height in meters")
- parser.add_argument('--hfov', type=float, default=79.0,
- help="horizontal field of view in degrees")
- parser.add_argument('--turn_angle', type=float, default=30,
- help="Agent turn angle in degrees")
- parser.add_argument('--min_depth', type=float, default=0.5,
- help="Minimum depth for depth sensor in meters")
- parser.add_argument('--max_depth', type=float, default=5.0,
- help="Maximum depth for depth sensor in meters")
- parser.add_argument('--success_dist', type=float, default=1.0,
- help="success distance threshold in meters")
- parser.add_argument('--floor_thr', type=int, default=50,
- help="floor threshold in cm")
- parser.add_argument('--min_d', type=float, default=1.5,
- help="min distance to goal during training in meters")
- parser.add_argument('--max_d', type=float, default=100.0,
- help="max distance to goal during training in meters")
- parser.add_argument('--version', type=str, default="v1.1",
- help="dataset version")
-
- # Model Hyperparameters
- parser.add_argument('--agent', type=str, default="sem_exp")
- parser.add_argument('--lr', type=float, default=2.5e-5,
- help='learning rate (default: 2.5e-5)')
- parser.add_argument('--global_hidden_size', type=int, default=256,
- help='global_hidden_size')
- parser.add_argument('--eps', type=float, default=1e-5,
- help='RL Optimizer epsilon (default: 1e-5)')
- parser.add_argument('--alpha', type=float, default=0.99,
- help='RL Optimizer alpha (default: 0.99)')
- parser.add_argument('--gamma', type=float, default=0.99,
- help='discount factor for rewards (default: 0.99)')
- parser.add_argument('--use_gae', action='store_true', default=False,
- help='use generalized advantage estimation')
- parser.add_argument('--tau', type=float, default=0.95,
- help='gae parameter (default: 0.95)')
- parser.add_argument('--entropy_coef', type=float, default=0.001,
- help='entropy term coefficient (default: 0.001)')
- parser.add_argument('--value_loss_coef', type=float, default=0.5,
- help='value loss coefficient (default: 0.5)')
- parser.add_argument('--max_grad_norm', type=float, default=0.5,
- help='max norm of gradients (default: 0.5)')
- parser.add_argument('--num_global_steps', type=int, default=20,
- help='number of forward steps in A2C (default: 20)')
- parser.add_argument('--ppo_epoch', type=int, default=4,
- help='number of ppo epochs (default: 4)')
- parser.add_argument('--num_mini_batch', type=str, default="auto",
- help='number of mini batches for PPO (default: auto)')
- parser.add_argument('--clip_param', type=float, default=0.2,
- help='ppo clip parameter (default: 0.2)')
- parser.add_argument('--use_recurrent_global', type=int, default=0,
- help='use a recurrent global policy')
- parser.add_argument('--num_local_steps', type=int, default=25,
- help="""Number of steps taken by the local policy
- between each global policy step""")
- parser.add_argument('--reward_coeff', type=float, default=0.1,
- help="Object goal reward coefficient")
- parser.add_argument('--intrinsic_rew_coeff', type=float, default=0.02,
- help="intrinsic exploration reward coefficient")
- parser.add_argument('--num_sem_categories', type=float, default=16)
- parser.add_argument('--sem_pred_prob_thr', type=float, default=0.9,
- help="Semantic prediction confidence threshold")
-
- # Mapping
- parser.add_argument('--global_downscaling', type=int, default=2)
- parser.add_argument('--vision_range', type=int, default=100)
- parser.add_argument('--map_resolution', type=int, default=5)
- parser.add_argument('--du_scale', type=int, default=1)
- parser.add_argument('--map_size_cm', type=int, default=2400)
- parser.add_argument('--cat_pred_threshold', type=float, default=5.0)
- parser.add_argument('--map_pred_threshold', type=float, default=1.0)
- parser.add_argument('--exp_pred_threshold', type=float, default=1.0)
- parser.add_argument('--collision_threshold', type=float, default=0.20)
-
- # parse arguments
- args = parser.parse_args()
-
- args.cuda = not args.no_cuda and torch.cuda.is_available()
-
- if args.cuda:
- if args.auto_gpu_config:
- num_gpus = torch.cuda.device_count()
- if args.total_num_scenes != "auto":
- args.total_num_scenes = int(args.total_num_scenes)
- elif "objectnav_gibson" in args.task_config and \
- "train" in args.split:
- args.total_num_scenes = 25
- elif "objectnav_gibson" in args.task_config and \
- "val" in args.split:
- args.total_num_scenes = 5
- else:
- assert False, "Unknown task config, please specify" + \
- " total_num_scenes"
-
- # GPU Memory required for the SemExp model:
- # 0.8 + 0.4 * args.total_num_scenes (GB)
- # GPU Memory required per thread: 2.6 (GB)
- min_memory_required = max(0.8 + 0.4 * args.total_num_scenes, 2.6)
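- # e.g. with the 25 Gibson train scenes: max(0.8 + 0.4 * 25, 2.6) = 10.8 (GB)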
- # Automatically configure number of training threads based on
- # number of GPUs available and GPU memory size
- gpu_memory = 1000
- for i in range(num_gpus):
- gpu_memory = min(gpu_memory,
- torch.cuda.get_device_properties(
- i).total_memory
- / 1024 / 1024 / 1024)
- assert gpu_memory > min_memory_required, \
- """Insufficient GPU memory for GPU {}, gpu memory ({}GB)
- needs to be greater than {}GB""".format(
- i, gpu_memory, min_memory_required)
-
- num_processes_per_gpu = int(gpu_memory / 2.6)
- num_processes_on_first_gpu = \
- int((gpu_memory - min_memory_required) / 2.6)
-
- if args.eval:
- max_threads = num_processes_per_gpu * (num_gpus - 1) \
- + num_processes_on_first_gpu
- assert max_threads >= args.total_num_scenes, \
- """Insufficient GPU memory for evaluation"""
-
- if num_gpus == 1:
- args.num_processes_on_first_gpu = num_processes_on_first_gpu
- args.num_processes_per_gpu = 0
- args.num_processes = num_processes_on_first_gpu
- assert args.num_processes > 0, "Insufficient GPU memory"
- else:
- num_threads = num_processes_per_gpu * (num_gpus - 1) \
- + num_processes_on_first_gpu
- num_threads = min(num_threads, args.total_num_scenes)
- args.num_processes_per_gpu = num_processes_per_gpu
- args.num_processes_on_first_gpu = max(
- 0,
- num_threads - args.num_processes_per_gpu * (num_gpus - 1))
- args.num_processes = num_threads
-
- args.sim_gpu_id = 1
-
- print("Auto GPU config:")
- print("Number of processes: {}".format(args.num_processes))
- print("Number of processes on GPU 0: {}".format(
- args.num_processes_on_first_gpu))
- print("Number of processes per GPU: {}".format(
- args.num_processes_per_gpu))
- else:
- args.sem_gpu_id = -2
-
- if args.num_mini_batch == "auto":
- args.num_mini_batch = max(args.num_processes // 2, 1)
- else:
- args.num_mini_batch = int(args.num_mini_batch)
-
- return args
diff --git a/configs/Base-RCNN-FPN.yaml b/configs/Base-RCNN-FPN.yaml
deleted file mode 100644
index 3e020f2..0000000
--- a/configs/Base-RCNN-FPN.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-MODEL:
- META_ARCHITECTURE: "GeneralizedRCNN"
- BACKBONE:
- NAME: "build_resnet_fpn_backbone"
- RESNETS:
- OUT_FEATURES: ["res2", "res3", "res4", "res5"]
- FPN:
- IN_FEATURES: ["res2", "res3", "res4", "res5"]
- ANCHOR_GENERATOR:
- SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
- ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
- RPN:
- IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
- PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
- PRE_NMS_TOPK_TEST: 1000 # Per FPN level
- # Detectron1 uses 2000 proposals per-batch,
- # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
- # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
- POST_NMS_TOPK_TRAIN: 1000
- POST_NMS_TOPK_TEST: 1000
- ROI_HEADS:
- NAME: "StandardROIHeads"
- IN_FEATURES: ["p2", "p3", "p4", "p5"]
- ROI_BOX_HEAD:
- NAME: "FastRCNNConvFCHead"
- NUM_FC: 2
- POOLER_RESOLUTION: 7
- ROI_MASK_HEAD:
- NAME: "MaskRCNNConvUpsampleHead"
- NUM_CONV: 4
- POOLER_RESOLUTION: 14
-DATASETS:
- TRAIN: ("coco_2017_train",)
- TEST: ("coco_2017_val",)
-SOLVER:
- IMS_PER_BATCH: 16
- BASE_LR: 0.02
- STEPS: (60000, 80000)
- MAX_ITER: 90000
-INPUT:
- MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-VERSION: 2
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
deleted file mode 100644
index be7d06b..0000000
--- a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
- WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
- MASK_ON: True
- RESNETS:
- DEPTH: 50
-SOLVER:
- STEPS: (210000, 250000)
- MAX_ITER: 270000
diff --git a/constants.py b/constants.py
deleted file mode 100644
index 1f0179e..0000000
--- a/constants.py
+++ /dev/null
@@ -1,94 +0,0 @@
-scenes = {}
-scenes["train"] = [
- 'Allensville',
- 'Beechwood',
- 'Benevolence',
- 'Coffeen',
- 'Cosmos',
- 'Forkland',
- 'Hanson',
- 'Hiteman',
- 'Klickitat',
- 'Lakeville',
- 'Leonardo',
- 'Lindenwood',
- 'Marstons',
- 'Merom',
- 'Mifflinburg',
- 'Newfields',
- 'Onaga',
- 'Pinesdale',
- 'Pomaria',
- 'Ranchester',
- 'Shelbyville',
- 'Stockman',
- 'Tolstoy',
- 'Wainscott',
- 'Woodbine',
-]
-
-scenes["val"] = [
- 'Collierville',
- 'Corozal',
- 'Darden',
- 'Markleeville',
- 'Wiconisco',
-]
-
-coco_categories = {
- "chair": 0,
- "couch": 1,
- "potted plant": 2,
- "bed": 3,
- "toilet": 4,
- "tv": 5,
- "dining-table": 6,
- "oven": 7,
- "sink": 8,
- "refrigerator": 9,
- "book": 10,
- "clock": 11,
- "vase": 12,
- "cup": 13,
- "bottle": 14
-}
-
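- # COCO class indices (e.g. 56: chair) mapped to the 15 category ids defined above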
-coco_categories_mapping = {
- 56: 0, # chair
- 57: 1, # couch
- 58: 2, # potted plant
- 59: 3, # bed
- 61: 4, # toilet
- 62: 5, # tv
- 60: 6, # dining-table
- 69: 7, # oven
- 71: 8, # sink
- 72: 9, # refrigerator
- 73: 10, # book
- 74: 11, # clock
- 75: 12, # vase
- 41: 13, # cup
- 39: 14, # bottle
-}
-
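- # Flat list of RGB triplets in [0, 1]; used as a PIL palette when visualizing the semantic map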
-color_palette = [
- 1.0, 1.0, 1.0,
- 0.6, 0.6, 0.6,
- 0.95, 0.95, 0.95,
- 0.96, 0.36, 0.26,
- 0.12156862745098039, 0.47058823529411764, 0.7058823529411765,
- 0.9400000000000001, 0.7818, 0.66,
- 0.9400000000000001, 0.8868, 0.66,
- 0.8882000000000001, 0.9400000000000001, 0.66,
- 0.7832000000000001, 0.9400000000000001, 0.66,
- 0.6782000000000001, 0.9400000000000001, 0.66,
- 0.66, 0.9400000000000001, 0.7468000000000001,
- 0.66, 0.9400000000000001, 0.8518000000000001,
- 0.66, 0.9232, 0.9400000000000001,
- 0.66, 0.8182, 0.9400000000000001,
- 0.66, 0.7132, 0.9400000000000001,
- 0.7117999999999999, 0.66, 0.9400000000000001,
- 0.8168, 0.66, 0.9400000000000001,
- 0.9218, 0.66, 0.9400000000000001,
- 0.9400000000000001, 0.66, 0.8531999999999998,
- 0.9400000000000001, 0.66, 0.748199999999999]
diff --git a/envs/habitat/__init__.py b/envs/habitat/__init__.py
deleted file mode 100644
index e04b9ed..0000000
--- a/envs/habitat/__init__.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Parts of the code in this file have been borrowed from:
-# https://github.com/facebookresearch/habitat-api
-import os
-import numpy as np
-import torch
-from habitat.config.default import get_config as cfg_env
-from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1
-from habitat import Config, Env, RLEnv, VectorEnv, make_dataset
-
-from agents.sem_exp import Sem_Exp_Env_Agent
-from .objectgoal_env import ObjectGoal_Env
-
-from .utils.vector_env import VectorEnv
-
-
-def make_env_fn(args, config_env, rank):
- dataset = make_dataset(config_env.DATASET.TYPE, config=config_env.DATASET)
- config_env.defrost()
- config_env.SIMULATOR.SCENE = dataset.episodes[0].scene_id
- config_env.freeze()
-
- if args.agent == "sem_exp":
- env = Sem_Exp_Env_Agent(args=args, rank=rank,
- config_env=config_env,
- dataset=dataset
- )
- else:
- env = ObjectGoal_Env(args=args, rank=rank,
- config_env=config_env,
- dataset=dataset
- )
-
- env.seed(rank)
- return env
-
-
-def _get_scenes_from_folder(content_dir):
- scene_dataset_ext = ".glb.json.gz"
- scenes = []
- for filename in os.listdir(content_dir):
- if filename.endswith(scene_dataset_ext):
- scene = filename[: -len(scene_dataset_ext) + 4]
- scenes.append(scene)
- scenes.sort()
- return scenes
-
-
-def construct_envs(args):
- env_configs = []
- args_list = []
-
- basic_config = cfg_env(config_paths=["envs/habitat/configs/"
- + args.task_config])
- basic_config.defrost()
- basic_config.DATASET.SPLIT = args.split
- basic_config.DATASET.DATA_PATH = \
- basic_config.DATASET.DATA_PATH.replace("v1", args.version)
- basic_config.DATASET.EPISODES_DIR = \
- basic_config.DATASET.EPISODES_DIR.replace("v1", args.version)
- basic_config.freeze()
-
- scenes = basic_config.DATASET.CONTENT_SCENES
- if "*" in basic_config.DATASET.CONTENT_SCENES:
- content_dir = os.path.join(basic_config.DATASET.EPISODES_DIR.format(
- split=args.split), "content")
- scenes = _get_scenes_from_folder(content_dir)
-
- if len(scenes) > 0:
- assert len(scenes) >= args.num_processes, (
- "reduce the number of processes as there "
- "aren't enough scenes"
- )
-
- scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes))
- for _ in range(args.num_processes)]
- for i in range(len(scenes) % args.num_processes):
- scene_split_sizes[i] += 1
-
- print("Scenes per thread:")
- for i in range(args.num_processes):
- config_env = cfg_env(config_paths=["envs/habitat/configs/"
- + args.task_config])
- config_env.defrost()
-
- if len(scenes) > 0:
- config_env.DATASET.CONTENT_SCENES = scenes[
- sum(scene_split_sizes[:i]):
- sum(scene_split_sizes[:i + 1])
- ]
- print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES))
-
- if i < args.num_processes_on_first_gpu:
- gpu_id = 0
- else:
- gpu_id = int((i - args.num_processes_on_first_gpu)
- // args.num_processes_per_gpu) + args.sim_gpu_id
- gpu_id = min(torch.cuda.device_count() - 1, gpu_id)
- config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id
-
- agent_sensors = []
- agent_sensors.append("RGB_SENSOR")
- agent_sensors.append("DEPTH_SENSOR")
- # agent_sensors.append("SEMANTIC_SENSOR")
-
- config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors
-
- # Resetting episodes manually, setting high max episode length in sim
- config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000
- config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False
-
- config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width
- config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height
- config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov
- config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0]
-
- config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width
- config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height
- config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov
- config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth
- config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth
- config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0]
-
- # config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width
- # config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height
- # config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov
- # config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \
- # [0, args.camera_height, 0]
-
- config_env.SIMULATOR.TURN_ANGLE = args.turn_angle
- config_env.DATASET.SPLIT = args.split
- config_env.DATASET.DATA_PATH = \
- config_env.DATASET.DATA_PATH.replace("v1", args.version)
- config_env.DATASET.EPISODES_DIR = \
- config_env.DATASET.EPISODES_DIR.replace("v1", args.version)
-
- config_env.freeze()
- env_configs.append(config_env)
-
- args_list.append(args)
-
- envs = VectorEnv(
- make_env_fn=make_env_fn,
- env_fn_args=tuple(
- tuple(
- zip(args_list, env_configs, range(args.num_processes))
- )
- ),
- )
-
- return envs
diff --git a/envs/habitat/configs/tasks/objectnav_gibson.yaml b/envs/habitat/configs/tasks/objectnav_gibson.yaml
deleted file mode 100644
index d0b7d92..0000000
--- a/envs/habitat/configs/tasks/objectnav_gibson.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-ENVIRONMENT:
- MAX_EPISODE_STEPS: 500
-SIMULATOR:
- TURN_ANGLE: 30
- TILT_ANGLE: 30
- ACTION_SPACE_CONFIG: "v1"
- AGENT_0:
- SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR']
- HEIGHT: 0.88
- RADIUS: 0.18
- HABITAT_SIM_V0:
- GPU_DEVICE_ID: 0
- ALLOW_SLIDING: True
- SEMANTIC_SENSOR:
- WIDTH: 640
- HEIGHT: 480
- HFOV: 79
- POSITION: [0, 0.88, 0]
- RGB_SENSOR:
- WIDTH: 640
- HEIGHT: 480
- HFOV: 79
- POSITION: [0, 0.88, 0]
- DEPTH_SENSOR:
- WIDTH: 640
- HEIGHT: 480
- HFOV: 79
- MIN_DEPTH: 0.5
- MAX_DEPTH: 5.0
- POSITION: [0, 0.88, 0]
-TASK:
- TYPE: ObjectNav-v1
- POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"]
- SENSORS: ['GPS_SENSOR', 'COMPASS_SENSOR']
- MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
- SUCCESS:
- SUCCESS_DISTANCE: 0.2
-
-DATASET:
- TYPE: PointNav-v1
- SPLIT: train
- DATA_PATH: "data/datasets/objectnav/gibson/v1/{split}/{split}.json.gz"
- EPISODES_DIR: "data/datasets/objectnav/gibson/v1/{split}/"
- SCENES_DIR: "data/scene_datasets/"
diff --git a/envs/habitat/objectgoal_env.py b/envs/habitat/objectgoal_env.py
deleted file mode 100644
index a08dd55..0000000
--- a/envs/habitat/objectgoal_env.py
+++ /dev/null
@@ -1,465 +0,0 @@
-import json
-import bz2
-import gzip
-import _pickle as cPickle
-import gym
-import numpy as np
-import quaternion
-import skimage.morphology
-import habitat
-
-from envs.utils.fmm_planner import FMMPlanner
-from constants import coco_categories
-import envs.utils.pose as pu
-
-
-class ObjectGoal_Env(habitat.RLEnv):
- """The Object Goal Navigation environment class. The class is responsible
- for loading the dataset, generating episodes, and computing evaluation
- metrics.
- """
-
- def __init__(self, args, rank, config_env, dataset):
- self.args = args
- self.rank = rank
-
- super().__init__(config_env, dataset)
-
- # Loading dataset info file
- self.split = config_env.DATASET.SPLIT
- self.episodes_dir = config_env.DATASET.EPISODES_DIR.format(
- split=self.split)
-
- dataset_info_file = self.episodes_dir + \
- "{split}_info.pbz2".format(split=self.split)
- with bz2.BZ2File(dataset_info_file, 'rb') as f:
- self.dataset_info = cPickle.load(f)
-
- # Specifying action and observation space
- self.action_space = gym.spaces.Discrete(3)
-
- self.observation_space = gym.spaces.Box(0, 255,
- (3, args.frame_height,
- args.frame_width),
- dtype='uint8')
-
- # Initializations
- self.episode_no = 0
-
- # Scene info
- self.last_scene_path = None
- self.scene_path = None
- self.scene_name = None
-
- # Episode Dataset info
- self.eps_data = None
- self.eps_data_idx = None
- self.gt_planner = None
- self.object_boundary = None
- self.goal_idx = None
- self.goal_name = None
- self.map_obj_origin = None
- self.starting_loc = None
- self.starting_distance = None
-
- # Episode tracking info
- self.curr_distance = None
- self.prev_distance = None
- self.timestep = None
- self.stopped = None
- self.path_length = None
- self.last_sim_location = None
- self.trajectory_states = []
- self.info = {}
- self.info['distance_to_goal'] = None
- self.info['spl'] = None
- self.info['success'] = None
-
- def load_new_episode(self):
- """The function loads a fixed episode from the episode dataset. This
- function is used for evaluating a trained model on the val split.
- """
-
- args = self.args
- self.scene_path = self.habitat_env.sim.config.SCENE
- scene_name = self.scene_path.split("/")[-1].split(".")[0]
-
- if self.scene_path != self.last_scene_path:
- episodes_file = self.episodes_dir + \
- "content/{}_episodes.json.gz".format(scene_name)
-
- print("Loading episodes from: {}".format(episodes_file))
- with gzip.open(episodes_file, 'r') as f:
- self.eps_data = json.loads(
- f.read().decode('utf-8'))["episodes"]
-
- self.eps_data_idx = 0
- self.last_scene_path = self.scene_path
-
- # Load episode info
- episode = self.eps_data[self.eps_data_idx]
- self.eps_data_idx += 1
- self.eps_data_idx = self.eps_data_idx % len(self.eps_data)
- pos = episode["start_position"]
- rot = quaternion.from_float_array(episode["start_rotation"])
-
- goal_name = episode["object_category"]
- goal_idx = episode["object_id"]
- floor_idx = episode["floor_id"]
-
- # Load scene info
- scene_info = self.dataset_info[scene_name]
- sem_map = scene_info[floor_idx]['sem_map']
- map_obj_origin = scene_info[floor_idx]['origin']
-
- # Setup ground truth planner
- object_boundary = args.success_dist
- map_resolution = args.map_resolution
- selem = skimage.morphology.disk(2)
- traversible = skimage.morphology.binary_dilation(
- sem_map[0], selem) != True
- traversible = 1 - traversible
- planner = FMMPlanner(traversible)
- selem = skimage.morphology.disk(
- int(object_boundary * 100. / map_resolution))
- goal_map = skimage.morphology.binary_dilation(
- sem_map[goal_idx + 1], selem) != True
- goal_map = 1 - goal_map
- planner.set_multi_goal(goal_map)
-
- # Get starting loc in GT map coordinates
- x = -pos[2]
- y = -pos[0]
- min_x, min_y = map_obj_origin / 100.0
- map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.)
-
- self.gt_planner = planner
- self.starting_loc = map_loc
- self.object_boundary = object_boundary
- self.goal_idx = goal_idx
- self.goal_name = goal_name
- self.map_obj_origin = map_obj_origin
-
- self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc]\
- / 20.0 + self.object_boundary
- self.prev_distance = self.starting_distance
- self._env.sim.set_agent_state(pos, rot)
-
- # The following two should match approximately
- # print(starting_loc)
- # print(self.sim_continuous_to_sim_map(self.get_sim_location()))
-
- obs = self._env.sim.get_observations_at(pos, rot)
-
- return obs
-
- def generate_new_episode(self):
- """The function generates a random valid episode. This function is used
- for training a model on the train split.
- """
-
- args = self.args
-
- self.scene_path = self.habitat_env.sim.config.SCENE
- scene_name = self.scene_path.split("/")[-1].split(".")[0]
-
- scene_info = self.dataset_info[scene_name]
- map_resolution = args.map_resolution
-
- floor_idx = np.random.randint(len(scene_info.keys()))
- floor_height = scene_info[floor_idx]['floor_height']
- sem_map = scene_info[floor_idx]['sem_map']
- map_obj_origin = scene_info[floor_idx]['origin']
-
- cat_counts = sem_map.sum(2).sum(1)
- possible_cats = list(np.arange(6))
-
- for i in range(6):
- if cat_counts[i + 1] == 0:
- possible_cats.remove(i)
-
- object_boundary = args.success_dist
-
- loc_found = False
- while not loc_found:
- if len(possible_cats) == 0:
- # No valid goal categories remain on this floor, so fail explicitly
- raise ValueError(
- "No valid objects for floor height {}".format(floor_height))
-
- goal_idx = np.random.choice(possible_cats)
-
- for key, value in coco_categories.items():
- if value == goal_idx:
- goal_name = key
-
- selem = skimage.morphology.disk(2)
- traversible = skimage.morphology.binary_dilation(
- sem_map[0], selem) != True
- traversible = 1 - traversible
-
- planner = FMMPlanner(traversible)
-
- selem = skimage.morphology.disk(
- int(object_boundary * 100. / map_resolution))
- goal_map = skimage.morphology.binary_dilation(
- sem_map[goal_idx + 1], selem) != True
- goal_map = 1 - goal_map
-
- planner.set_multi_goal(goal_map)
-
- m1 = sem_map[0] > 0
- m2 = planner.fmm_dist > (args.min_d - object_boundary) * 20.0
- m3 = planner.fmm_dist < (args.max_d - object_boundary) * 20.0
-
- possible_starting_locs = np.logical_and(m1, m2)
- possible_starting_locs = np.logical_and(
- possible_starting_locs, m3) * 1.
- if possible_starting_locs.sum() != 0:
- loc_found = True
- else:
- print("Invalid object: {} / {} / {}".format(
- scene_name, floor_height, goal_name))
- possible_cats.remove(goal_idx)
- scene_info[floor_idx]["sem_map"][goal_idx + 1, :, :] = 0.
- self.dataset_info[scene_name][floor_idx][
- "sem_map"][goal_idx + 1, :, :] = 0.
-
- loc_found = False
- while not loc_found:
- pos = self._env.sim.sample_navigable_point()
- x = -pos[2]
- y = -pos[0]
- min_x, min_y = map_obj_origin / 100.0
- map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.)
- if abs(pos[1] - floor_height) < args.floor_thr / 100.0 and \
- possible_starting_locs[map_loc[0], map_loc[1]] == 1:
- loc_found = True
-
- agent_state = self._env.sim.get_agent_state(0)
- rotation = agent_state.rotation
- rvec = quaternion.as_rotation_vector(rotation)
- rvec[1] = np.random.rand() * 2 * np.pi
- rot = quaternion.from_rotation_vector(rvec)
-
- self.gt_planner = planner
- self.starting_loc = map_loc
- self.object_boundary = object_boundary
- self.goal_idx = goal_idx
- self.goal_name = goal_name
- self.map_obj_origin = map_obj_origin
-
- self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc] \
- / 20.0 + self.object_boundary
- self.prev_distance = self.starting_distance
-
- self._env.sim.set_agent_state(pos, rot)
-
- # The following two should match approximately
- # print(starting_loc)
- # print(self.sim_continuous_to_sim_map(self.get_sim_location()))
-
- obs = self._env.sim.get_observations_at(pos, rot)
-
- return obs
-
- def sim_map_to_sim_continuous(self, coords):
- """Converts ground-truth 2D Map coordinates to absolute Habitat
- simulator position and rotation.
- """
- agent_state = self._env.sim.get_agent_state(0)
- y, x = coords
- min_x, min_y = self.map_obj_origin / 100.0
-
- cont_x = x / 20. + min_x
- cont_y = y / 20. + min_y
- agent_state.position[0] = cont_y
- agent_state.position[2] = cont_x
-
- rotation = agent_state.rotation
- rvec = quaternion.as_rotation_vector(rotation)
-
- if self.args.train_single_eps:
- rvec[1] = 0.0
- else:
- rvec[1] = np.random.rand() * 2 * np.pi
- rot = quaternion.from_rotation_vector(rvec)
-
- return agent_state.position, rot
-
- def sim_continuous_to_sim_map(self, sim_loc):
- """Converts absolute Habitat simulator pose to ground-truth 2D Map
- coordinates.
- """
- x, y, o = sim_loc
- min_x, min_y = self.map_obj_origin / 100.0
- x, y = int((-x - min_x) * 20.), int((-y - min_y) * 20.)
-
- o = np.rad2deg(o) + 180.0
- return y, x, o
-
- def reset(self):
- """Resets the environment to a new episode.
-
- Returns:
- obs (ndarray): RGBD observations (4 x H x W)
- info (dict): contains timestep, pose, goal category and
- evaluation metric info
- """
- args = self.args
- new_scene = self.episode_no % args.num_train_episodes == 0
-
- self.episode_no += 1
-
- # Initializations
- self.timestep = 0
- self.stopped = False
- self.path_length = 1e-5
- self.trajectory_states = []
-
- if new_scene:
- obs = super().reset()
- self.scene_name = self.habitat_env.sim.config.SCENE
- print("Changing scene: {}/{}".format(self.rank, self.scene_name))
-
- self.scene_path = self.habitat_env.sim.config.SCENE
-
- if self.split == "val":
- obs = self.load_new_episode()
- else:
- obs = self.generate_new_episode()
-
- rgb = obs['rgb'].astype(np.uint8)
- depth = obs['depth']
- state = np.concatenate((rgb, depth), axis=2).transpose(2, 0, 1)
- self.last_sim_location = self.get_sim_location()
-
- # Set info
- self.info['time'] = self.timestep
- self.info['sensor_pose'] = [0., 0., 0.]
- self.info['goal_cat_id'] = self.goal_idx
- self.info['goal_name'] = self.goal_name
-
- return state, self.info
-
- def step(self, action):
- """Function to take an action in the environment.
-
- Args:
- action (dict):
- dict with following keys:
- 'action' (int): 0: stop, 1: forward, 2: left, 3: right
-
- Returns:
- obs (ndarray): RGBD observations (4 x H x W)
- reward (float): amount of reward returned after previous action
- done (bool): whether the episode has ended
- info (dict): contains timestep, pose, goal category and
- evaluation metric info
- """
- action = action["action"]
- if action == 0:
- self.stopped = True
- # Not sending stop to simulator, resetting manually
- action = 3
-
- obs, rew, done, _ = super().step(action)
-
- # Get pose change
- dx, dy, do = self.get_pose_change()
- self.info['sensor_pose'] = [dx, dy, do]
- self.path_length += pu.get_l2_distance(0, dx, 0, dy)
-
- spl, success, dist = 0., 0., 0.
- if done:
- spl, success, dist = self.get_metrics()
- self.info['distance_to_goal'] = dist
- self.info['spl'] = spl
- self.info['success'] = success
-
- rgb = obs['rgb'].astype(np.uint8)
- depth = obs['depth']
- state = np.concatenate((rgb, depth), axis=2).transpose(2, 0, 1)
-
- self.timestep += 1
- self.info['time'] = self.timestep
-
- return state, rew, done, self.info
-
- def get_reward_range(self):
- """This function is not used, Habitat-RLEnv requires this function"""
- return (0., 1.0)
-
- def get_reward(self, observations):
- curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
- self.curr_distance = self.gt_planner.fmm_dist[curr_loc[0],
- curr_loc[1]] / 20.0
-
- reward = (self.prev_distance - self.curr_distance) * \
- self.args.reward_coeff
-
- self.prev_distance = self.curr_distance
- return reward
-
- def get_metrics(self):
- """This function computes evaluation metrics for the Object Goal task
-
- Returns:
- spl (float): Success weighted by Path Length
- (See https://arxiv.org/pdf/1807.06757.pdf)
- success (int): 0: Failure, 1: Successful
- dist (float): Distance to Success (DTS), distance of the agent
- from the success threshold boundary in meters.
- (See https://arxiv.org/pdf/2007.00643.pdf)
- """
- curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
- dist = self.gt_planner.fmm_dist[curr_loc[0], curr_loc[1]] / 20.0
- if dist == 0.0:
- success = 1
- else:
- success = 0
- spl = min(success * self.starting_distance / self.path_length, 1)
- return spl, success, dist
-
- def get_done(self, observations):
- if self.info['time'] >= self.args.max_episode_length - 1:
- done = True
- elif self.stopped:
- done = True
- else:
- done = False
- return done
-
- def get_info(self, observations):
- """This function is not used, Habitat-RLEnv requires this function"""
- info = {}
- return info
-
- def get_spaces(self):
- """Returns observation and action spaces for the ObjectGoal task."""
- return self.observation_space, self.action_space
-
- def get_sim_location(self):
- """Returns x, y, o pose of the agent in the Habitat simulator."""
-
- agent_state = super().habitat_env.sim.get_agent_state(0)
- x = -agent_state.position[2]
- y = -agent_state.position[0]
- axis = quaternion.as_euler_angles(agent_state.rotation)[0]
- if (axis % (2 * np.pi)) < 0.1 or (axis %
- (2 * np.pi)) > 2 * np.pi - 0.1:
- o = quaternion.as_euler_angles(agent_state.rotation)[1]
- else:
- o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1]
- if o > np.pi:
- o -= 2 * np.pi
- return x, y, o
-
- def get_pose_change(self):
- """Returns dx, dy, do pose change of the agent relative to the last
- timestep."""
- curr_sim_pose = self.get_sim_location()
- dx, dy, do = pu.get_rel_pose_change(
- curr_sim_pose, self.last_sim_location)
- self.last_sim_location = curr_sim_pose
- return dx, dy, do
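For context on the environment deleted above: the dense reward in `get_reward` is the per-step decrease in the fast-marching (FMM) geodesic distance to the goal, with map distances converted to meters via the 20-cells-per-meter resolution used throughout the file. A minimal standalone sketch of that shaping term (the function and variable names here are illustrative, not part of the patch):

```python
import numpy as np

MAP_CELLS_PER_METER = 20.0  # matches the `/ 20.0` factors in the deleted env


def distance_reward(fmm_dist, prev_cell, curr_cell, reward_coeff=1.0):
    """Reward = decrease in geodesic distance to the goal (in meters),
    scaled by reward_coeff, mirroring the deleted get_reward()."""
    prev_d = fmm_dist[prev_cell] / MAP_CELLS_PER_METER
    curr_d = fmm_dist[curr_cell] / MAP_CELLS_PER_METER
    return (prev_d - curr_d) * reward_coeff, curr_d


# Toy usage: a 5x5 distance field whose goal sits at cell (0, 0).
fmm = np.fromfunction(lambda r, c: r + c, (5, 5))
rew, _ = distance_reward(fmm, prev_cell=(4, 4), curr_cell=(3, 4))
assert abs(rew - 0.05) < 1e-9  # one cell of progress == 5 cm closer
```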
diff --git a/envs/habitat/utils/vector_env.py b/envs/habitat/utils/vector_env.py
deleted file mode 100644
index 389300a..0000000
--- a/envs/habitat/utils/vector_env.py
+++ /dev/null
@@ -1,586 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) Facebook, Inc. and its affiliates.
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-from multiprocessing.connection import Connection
-from multiprocessing.context import BaseContext
-from queue import Queue
-from threading import Thread
-from typing import (
- Any,
- Callable,
- Dict,
- List,
- Optional,
- Sequence,
- Set,
- Tuple,
- Union,
-)
-
-import gym
-import numpy as np
-from gym.spaces.dict_space import Dict as SpaceDict
-
-import habitat
-from habitat.config import Config
-from habitat.core.env import Env, Observations, RLEnv
-from habitat.core.logging import logger
-from habitat.core.utils import tile_images
-
-try:
- # Use torch.multiprocessing if we can.
- # We have yet to find a reason to not use it and
- # you are required to use it when sending a torch.Tensor
- # between processes
- import torch.multiprocessing as mp
-except ImportError:
- import multiprocessing as mp
-
-STEP_COMMAND = "step"
-RESET_COMMAND = "reset"
-RENDER_COMMAND = "render"
-CLOSE_COMMAND = "close"
-OBSERVATION_SPACE_COMMAND = "observation_space"
-ACTION_SPACE_COMMAND = "action_space"
-CALL_COMMAND = "call"
-EPISODE_COMMAND = "current_episode"
-PLAN_ACT_AND_PREPROCESS = "plan_act_and_preprocess"
-COUNT_EPISODES_COMMAND = "count_episodes"
-EPISODE_OVER = "episode_over"
-GET_METRICS = "get_metrics"
-
-
-def _make_env_fn(
- config: Config, dataset: Optional[habitat.Dataset] = None, rank: int = 0
-) -> Env:
- """Constructor for default habitat `env.Env`.
-
- :param config: configuration for environment.
- :param dataset: dataset for environment.
- :param rank: rank for setting seed of environment
- :return: `env.Env` / `env.RLEnv` object
- """
- habitat_env = Env(config=config, dataset=dataset)
- habitat_env.seed(config.SEED + rank)
- return habitat_env
-
-
-class VectorEnv:
- r"""Vectorized environment which creates multiple processes where each
- process runs its own environment. Main class for parallelization of
- training and evaluation.
-
-
- All the environments are synchronized on step and reset methods.
- """
-
- observation_spaces: List[SpaceDict]
- action_spaces: List[SpaceDict]
- _workers: List[Union[mp.Process, Thread]]
- _is_waiting: bool
- _num_envs: int
- _auto_reset_done: bool
- _mp_ctx: BaseContext
- _connection_read_fns: List[Callable[[], Any]]
- _connection_write_fns: List[Callable[[Any], None]]
-
- def __init__(
- self,
- make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn,
- env_fn_args: Sequence[Tuple] = None,
- auto_reset_done: bool = True,
- multiprocessing_start_method: str = "forkserver",
- ) -> None:
- """..
-
- :param make_env_fn: function which creates a single environment. An
- environment can be of type `env.Env` or `env.RLEnv`
- :param env_fn_args: tuple of tuple of args to pass to the
- `_make_env_fn`.
- :param auto_reset_done: automatically reset the environment when
- done. This functionality is provided for seamless training
- of vectorized environments.
- :param multiprocessing_start_method: the multiprocessing method used to
- spawn worker processes. Valid methods are
- :py:`{'spawn', 'forkserver', 'fork'}`; :py:`'forkserver'` is the
- recommended method as it works well with CUDA. If :py:`'fork'` is
- used, the subprocess must be started before any other GPU usage.
- """
- self._is_waiting = False
- self._is_closed = True
-
- assert (
- env_fn_args is not None and len(env_fn_args) > 0
- ), "number of environments to be created should be greater than 0"
-
- self._num_envs = len(env_fn_args)
-
- assert multiprocessing_start_method in self._valid_start_methods, (
- "multiprocessing_start_method must be one of {}. Got '{}'"
- ).format(self._valid_start_methods, multiprocessing_start_method)
- self._auto_reset_done = auto_reset_done
- self._mp_ctx = mp.get_context(multiprocessing_start_method)
- self._workers = []
- (
- self._connection_read_fns,
- self._connection_write_fns,
- ) = self._spawn_workers( # noqa
- env_fn_args, make_env_fn
- )
-
- self._is_closed = False
-
- for write_fn in self._connection_write_fns:
- write_fn((OBSERVATION_SPACE_COMMAND, None))
- self.observation_spaces = [
- read_fn() for read_fn in self._connection_read_fns
- ]
- for write_fn in self._connection_write_fns:
- write_fn((ACTION_SPACE_COMMAND, None))
- self.action_spaces = [
- read_fn() for read_fn in self._connection_read_fns
- ]
- self.observation_space = self.observation_spaces[0]
- self.action_space = self.action_spaces[0]
- self._paused = []
-
- @property
- def num_envs(self):
- r"""number of individual environments.
- """
- return self._num_envs - len(self._paused)
-
- @staticmethod
- def _worker_env(
- connection_read_fn: Callable,
- connection_write_fn: Callable,
- env_fn: Callable,
- env_fn_args: Tuple[Any],
- auto_reset_done: bool,
- child_pipe: Optional[Connection] = None,
- parent_pipe: Optional[Connection] = None,
- ) -> None:
- r"""process worker for creating and interacting with the environment.
- """
- env = env_fn(*env_fn_args)
- if parent_pipe is not None:
- parent_pipe.close()
- try:
- command, data = connection_read_fn()
- while command != CLOSE_COMMAND:
- if command == STEP_COMMAND:
- # different step methods for habitat.RLEnv and habitat.Env
- if isinstance(env, habitat.RLEnv) or isinstance(
- env, gym.Env
- ):
- # habitat.RLEnv
- observations, reward, done, info = env.step(**data)
- if auto_reset_done and done:
- observations, info = env.reset()
- connection_write_fn((observations, reward, done, info))
- elif isinstance(env, habitat.Env):
- # habitat.Env
- observations = env.step(**data)
- if auto_reset_done and env.episode_over:
- observations = env.reset()
- connection_write_fn(observations)
- else:
- raise NotImplementedError
-
- elif command == RESET_COMMAND:
- observations = env.reset()
- connection_write_fn(observations)
-
- elif command == RENDER_COMMAND:
- connection_write_fn(env.render(*data[0], **data[1]))
-
- elif (
- command == OBSERVATION_SPACE_COMMAND
- or command == ACTION_SPACE_COMMAND
- ):
- if isinstance(command, str):
- connection_write_fn(getattr(env, command))
-
- elif command == CALL_COMMAND:
- function_name, function_args = data
- if function_args is None or len(function_args) == 0:
- result = getattr(env, function_name)()
- else:
- result = getattr(env, function_name)(**function_args)
- connection_write_fn(result)
-
- # TODO: update CALL_COMMAND for getting attribute like this
- elif command == EPISODE_COMMAND:
- connection_write_fn(env.current_episode)
-
- elif command == PLAN_ACT_AND_PREPROCESS:
- observations, reward, done, info = \
- env.plan_act_and_preprocess(data)
- if auto_reset_done and done:
- observations, info = env.reset()
- connection_write_fn((observations, reward, done, info))
-
- elif command == COUNT_EPISODES_COMMAND:
- connection_write_fn(len(env.episodes))
-
- elif command == EPISODE_OVER:
- connection_write_fn(env.episode_over)
-
- elif command == GET_METRICS:
- result = env.get_metrics()
- connection_write_fn(result)
-
- else:
- raise NotImplementedError
-
- command, data = connection_read_fn()
-
- if child_pipe is not None:
- child_pipe.close()
- except KeyboardInterrupt:
- logger.info("Worker KeyboardInterrupt")
- finally:
- env.close()
-
- def _spawn_workers(
- self,
- env_fn_args: Sequence[Tuple],
- make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn,
- ) -> Tuple[List[Callable[[], Any]], List[Callable[[Any], None]]]:
- parent_connections, worker_connections = zip(
- *[self._mp_ctx.Pipe(duplex=True) for _ in range(self._num_envs)]
- )
- self._workers = []
- for worker_conn, parent_conn, env_args in zip(
- worker_connections, parent_connections, env_fn_args
- ):
- ps = self._mp_ctx.Process(
- target=self._worker_env,
- args=(
- worker_conn.recv,
- worker_conn.send,
- make_env_fn,
- env_args,
- self._auto_reset_done,
- worker_conn,
- parent_conn,
- ),
- )
- self._workers.append(ps)
- ps.daemon = True
- ps.start()
- worker_conn.close()
- return (
- [p.recv for p in parent_connections],
- [p.send for p in parent_connections],
- )
-
- def current_episodes(self):
- self._is_waiting = True
- for write_fn in self._connection_write_fns:
- write_fn((EPISODE_COMMAND, None))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- return results
-
- def count_episodes(self):
- self._is_waiting = True
- for write_fn in self._connection_write_fns:
- write_fn((COUNT_EPISODES_COMMAND, None))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- return results
-
- def episode_over(self):
- self._is_waiting = True
- for write_fn in self._connection_write_fns:
- write_fn((EPISODE_OVER, None))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- return results
-
- def get_metrics(self):
- self._is_waiting = True
- for write_fn in self._connection_write_fns:
- write_fn((GET_METRICS, None))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- return results
-
- def reset(self):
- r"""Reset all the vectorized environments
-
- :return: list of outputs from the reset method of envs.
- """
- self._is_waiting = True
- for write_fn in self._connection_write_fns:
- write_fn((RESET_COMMAND, None))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- obs, infos = zip(*results)
-
- self._is_waiting = False
- return np.stack(obs), infos
-
- def reset_at(self, index_env: int):
- r"""Reset in the index_env environment in the vector.
-
- :param index_env: index of the environment to be reset
- :return: list containing the output of reset method of indexed env.
- """
- self._is_waiting = True
- self._connection_write_fns[index_env]((RESET_COMMAND, None))
- results = [self._connection_read_fns[index_env]()]
- self._is_waiting = False
- return results
-
- def step_at(self, index_env: int, action: Dict[str, Any]):
- r"""Step in the index_env environment in the vector.
-
- :param index_env: index of the environment to be stepped into
- :param action: action to be taken
- :return: list containing the output of step method of indexed env.
- """
- self._is_waiting = True
- self._connection_write_fns[index_env]((STEP_COMMAND, action))
- results = [self._connection_read_fns[index_env]()]
- self._is_waiting = False
- return results
-
- def step_async(self, data: List[Union[int, str, Dict[str, Any]]]) -> None:
- r"""Asynchronously step in the environments.
-
- :param data: list of size _num_envs containing keyword arguments to
- pass to `step` method for each Environment. For example,
- :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`.
- """
- # Backward compatibility
- if isinstance(data[0], (int, np.integer, str)):
- data = [{"action": {"action": action}} for action in data]
-
- self._is_waiting = True
- for write_fn, args in zip(self._connection_write_fns, data):
- write_fn((STEP_COMMAND, args))
-
- def step_wait(self) -> List[Observations]:
- r"""Wait until all the asynchronized environments have synchronized.
- """
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- obs, rews, dones, infos = zip(*results)
- return np.stack(obs), np.stack(rews), np.stack(dones), infos
-
- def step(self, data: List[Union[int, str, Dict[str, Any]]]) -> List[Any]:
- r"""Perform actions in the vectorized environments.
-
- :param data: list of size _num_envs containing keyword arguments to
- pass to `step` method for each Environment. For example,
- :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`.
- :return: list of outputs from the step method of envs.
- """
- self.step_async(data)
- return self.step_wait()
-
- def close(self) -> None:
- if self._is_closed:
- return
-
- if self._is_waiting:
- for read_fn in self._connection_read_fns:
- read_fn()
-
- for write_fn in self._connection_write_fns:
- write_fn((CLOSE_COMMAND, None))
-
- for _, _, write_fn, _ in self._paused:
- write_fn((CLOSE_COMMAND, None))
-
- for process in self._workers:
- process.join()
-
- for _, _, _, process in self._paused:
- process.join()
-
- self._is_closed = True
-
- def pause_at(self, index: int) -> None:
- r"""Pauses computation on this env without destroying the env.
-
- :param index: which env to pause. All indexes after this one will be
- shifted down by one.
-
- This is useful when only some environments are still active, so steps do
- not need to be called on all of them (for example, during the final
- episodes of an evaluation run).
- """
- if self._is_waiting:
- for read_fn in self._connection_read_fns:
- read_fn()
- read_fn = self._connection_read_fns.pop(index)
- write_fn = self._connection_write_fns.pop(index)
- worker = self._workers.pop(index)
- self._paused.append((index, read_fn, write_fn, worker))
-
- def resume_all(self) -> None:
- r"""Resumes any paused envs.
- """
- for index, read_fn, write_fn, worker in reversed(self._paused):
- self._connection_read_fns.insert(index, read_fn)
- self._connection_write_fns.insert(index, write_fn)
- self._workers.insert(index, worker)
- self._paused = []
-
- def call_at(
- self,
- index: int,
- function_name: str,
- function_args: Optional[Dict[str, Any]] = None,
- ) -> Any:
- r"""Calls a function (which is passed by name) on the selected env and
- returns the result.
-
- :param index: which env to call the function on.
- :param function_name: the name of the function to call on the env.
- :param function_args: optional function args.
- :return: result of calling the function.
- """
- self._is_waiting = True
- self._connection_write_fns[index](
- (CALL_COMMAND, (function_name, function_args))
- )
- result = self._connection_read_fns[index]()
- self._is_waiting = False
- return result
-
- def call(
- self,
- function_names: List[str],
- function_args_list: Optional[List[Any]] = None,
- ) -> List[Any]:
- r"""Calls a list of functions (which are passed by name) on the
- corresponding env (by index).
-
- :param function_names: the name of the functions to call on the envs.
- :param function_args_list: list of function args for each function. If
- provided, :py:`len(function_args_list)` should be as long as
- :py:`len(function_names)`.
- :return: result of calling the function.
- """
- self._is_waiting = True
- if function_args_list is None:
- function_args_list = [None] * len(function_names)
- assert len(function_names) == len(function_args_list)
- func_args = zip(function_names, function_args_list)
- for write_fn, func_args_on in zip(
- self._connection_write_fns, func_args
- ):
- write_fn((CALL_COMMAND, func_args_on))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- self._is_waiting = False
- return results
-
- def render(
- self, mode: str = "human", *args, **kwargs
- ) -> Union[np.ndarray, None]:
- r"""Render observations from all environments in a tiled image.
- """
- for write_fn in self._connection_write_fns:
- write_fn((RENDER_COMMAND, (args, {"mode": "rgb", **kwargs})))
- images = [read_fn() for read_fn in self._connection_read_fns]
- tile = tile_images(images)
- if mode == "human":
- from habitat.core.utils import try_cv2_import
-
- cv2 = try_cv2_import()
-
- cv2.imshow("vecenv", tile[:, :, ::-1])
- cv2.waitKey(1)
- return None
- elif mode == "rgb_array":
- return tile
- else:
- raise NotImplementedError
-
- def plan_act_and_preprocess(self, inputs):
- self._assert_not_closed()
- self._is_waiting = True
- for e, write_fn in enumerate(self._connection_write_fns):
- write_fn((PLAN_ACT_AND_PREPROCESS, inputs[e]))
- results = []
- for read_fn in self._connection_read_fns:
- results.append(read_fn())
- obs, rews, dones, infos = zip(*results)
- self._is_waiting = False
- return np.stack(obs), np.stack(rews), np.stack(dones), infos
-
- def _assert_not_closed(self):
- assert not self._is_closed, "Trying to operate on a SubprocVecEnv after calling close()"
-
- @property
- def _valid_start_methods(self) -> Set[str]:
- return {"forkserver", "spawn", "fork"}
-
- def __del__(self):
- self.close()
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.close()
-
-
-class ThreadedVectorEnv(VectorEnv):
- r"""Provides same functionality as `VectorEnv`, the only difference is it
- runs in a multi-thread setup inside a single process.
-
- `VectorEnv` runs in a multi-proc setup. This makes it much easier to debug
- when using `VectorEnv` because you can actually put break points in the
- environment methods. It should not be used for best performance.
- """
-
- def _spawn_workers(
- self,
- env_fn_args: Sequence[Tuple],
- make_env_fn: Callable[..., Env] = _make_env_fn,
- ) -> Tuple[List[Callable[[], Any]], List[Callable[[Any], None]]]:
- parent_read_queues, parent_write_queues = zip(
- *[(Queue(), Queue()) for _ in range(self._num_envs)]
- )
- self._workers = []
- for parent_read_queue, parent_write_queue, env_args in zip(
- parent_read_queues, parent_write_queues, env_fn_args
- ):
- thread = Thread(
- target=self._worker_env,
- args=(
- parent_write_queue.get,
- parent_read_queue.put,
- make_env_fn,
- env_args,
- self._auto_reset_done,
- ),
- )
- self._workers.append(thread)
- thread.daemon = True
- thread.start()
- return (
- [q.get for q in parent_read_queues],
- [q.put for q in parent_write_queues],
- )
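To make the removed `VectorEnv` easier to reason about, here is a hedged, self-contained usage sketch. `DummyEnv` is hypothetical; it only mimics the contract the worker loop above relies on (a gym-style env whose `reset()` returns `(obs, info)` and whose `step()` returns `(obs, reward, done, info)`). Passing plain ints to `step()` exercises the backward-compatibility branch in `step_async`.

```python
import gym
import numpy as np

from envs.habitat.utils.vector_env import VectorEnv  # the module deleted above


class DummyEnv(gym.Env):
    """Hypothetical stand-in with the (obs, info) reset contract expected above."""
    observation_space = gym.spaces.Box(0, 1, (4,), dtype=np.float32)
    action_space = gym.spaces.Discrete(4)

    def reset(self):
        return np.zeros(4, dtype=np.float32), {}

    def step(self, action):
        obs = np.random.rand(4).astype(np.float32)
        return obs, 0.0, False, {"last_action": action}


def make_dummy_env(rank):
    return DummyEnv()


if __name__ == "__main__":
    envs = VectorEnv(make_env_fn=make_dummy_env,
                     env_fn_args=[(rank,) for rank in range(2)],
                     multiprocessing_start_method="forkserver")
    obs, infos = envs.reset()                    # obs is stacked to shape (2, 4)
    obs, rews, dones, infos = envs.step([1, 2])  # ints use the backward-compat path
    envs.close()
```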
diff --git a/main.py b/main.py
deleted file mode 100755
index 437c8ad..0000000
--- a/main.py
+++ /dev/null
@@ -1,695 +0,0 @@
-from collections import deque, defaultdict
-import os
-import logging
-import time
-import json
-import gym
-import torch.nn as nn
-import torch
-import numpy as np
-
-from model import RL_Policy, Semantic_Mapping
-from utils.storage import GlobalRolloutStorage
-from envs import make_vec_envs
-from arguments import get_args
-import algo
-
-os.environ["OMP_NUM_THREADS"] = "1"
-
-
-def main():
- args = get_args()
-
- np.random.seed(args.seed)
- torch.manual_seed(args.seed)
-
- if args.cuda:
- torch.cuda.manual_seed(args.seed)
-
- # Setup Logging
- log_dir = "{}/models/{}/".format(args.dump_location, args.exp_name)
- dump_dir = "{}/dump/{}/".format(args.dump_location, args.exp_name)
-
- if not os.path.exists(log_dir):
- os.makedirs(log_dir)
- if not os.path.exists(dump_dir):
- os.makedirs(dump_dir)
-
- logging.basicConfig(
- filename=log_dir + 'train.log',
- level=logging.INFO)
- print("Dumping at {}".format(log_dir))
- print(args)
- logging.info(args)
-
- # Logging and loss variables
- num_scenes = args.num_processes
- num_episodes = int(args.num_eval_episodes)
- device = args.device = torch.device("cuda:0" if args.cuda else "cpu")
-
- g_masks = torch.ones(num_scenes).float().to(device)
-
- best_g_reward = -np.inf
-
- if args.eval:
- episode_success = []
- episode_spl = []
- episode_dist = []
- for _ in range(args.num_processes):
- episode_success.append(deque(maxlen=num_episodes))
- episode_spl.append(deque(maxlen=num_episodes))
- episode_dist.append(deque(maxlen=num_episodes))
-
- else:
- episode_success = deque(maxlen=1000)
- episode_spl = deque(maxlen=1000)
- episode_dist = deque(maxlen=1000)
-
- finished = np.zeros((args.num_processes))
- wait_env = np.zeros((args.num_processes))
-
- g_episode_rewards = deque(maxlen=1000)
-
- g_value_losses = deque(maxlen=1000)
- g_action_losses = deque(maxlen=1000)
- g_dist_entropies = deque(maxlen=1000)
-
- per_step_g_rewards = deque(maxlen=1000)
-
- g_process_rewards = np.zeros((num_scenes))
-
- # Starting environments
- torch.set_num_threads(1)
- envs = make_vec_envs(args)
- obs, infos = envs.reset()
-
- torch.set_grad_enabled(False)
-
- # Initialize map variables:
- # Full map consists of multiple channels containing the following:
- # 1. Obstacle Map
- # 2. Explored Area
- # 3. Current Agent Location
- # 4. Past Agent Locations
- # 5,6,7,.. : Semantic Categories
- nc = args.num_sem_categories + 4 # num channels
-
- # Calculating full and local map sizes
- map_size = args.map_size_cm // args.map_resolution
- full_w, full_h = map_size, map_size
- local_w = int(full_w / args.global_downscaling)
- local_h = int(full_h / args.global_downscaling)
-
- # Initializing full and local map
- full_map = torch.zeros(num_scenes, nc, full_w, full_h).float().to(device)
- local_map = torch.zeros(num_scenes, nc, local_w,
- local_h).float().to(device)
-
- # Initial full and local pose
- full_pose = torch.zeros(num_scenes, 3).float().to(device)
- local_pose = torch.zeros(num_scenes, 3).float().to(device)
-
- # Origin of local map
- origins = np.zeros((num_scenes, 3))
-
- # Local Map Boundaries
- lmb = np.zeros((num_scenes, 4)).astype(int)
-
- # Planner pose inputs have 7 dimensions
- # 1-3 store continuous global agent location
- # 4-7 store local map boundaries
- planner_pose_inputs = np.zeros((num_scenes, 7))
-
- def get_local_map_boundaries(agent_loc, local_sizes, full_sizes):
- loc_r, loc_c = agent_loc
- local_w, local_h = local_sizes
- full_w, full_h = full_sizes
-
- if args.global_downscaling > 1:
- gx1, gy1 = loc_r - local_w // 2, loc_c - local_h // 2
- gx2, gy2 = gx1 + local_w, gy1 + local_h
- if gx1 < 0:
- gx1, gx2 = 0, local_w
- if gx2 > full_w:
- gx1, gx2 = full_w - local_w, full_w
-
- if gy1 < 0:
- gy1, gy2 = 0, local_h
- if gy2 > full_h:
- gy1, gy2 = full_h - local_h, full_h
- else:
- gx1, gx2, gy1, gy2 = 0, full_w, 0, full_h
-
- return [gx1, gx2, gy1, gy2]
-
- def init_map_and_pose():
- full_map.fill_(0.)
- full_pose.fill_(0.)
- full_pose[:, :2] = args.map_size_cm / 100.0 / 2.0
-
- locs = full_pose.cpu().numpy()
- planner_pose_inputs[:, :3] = locs
- for e in range(num_scenes):
- r, c = locs[e, 1], locs[e, 0]
- loc_r, loc_c = [int(r * 100.0 / args.map_resolution),
- int(c * 100.0 / args.map_resolution)]
-
- full_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 2] = 1.0
-
- lmb[e] = get_local_map_boundaries((loc_r, loc_c),
- (local_w, local_h),
- (full_w, full_h))
-
- planner_pose_inputs[e, 3:] = lmb[e]
- origins[e] = [lmb[e][2] * args.map_resolution / 100.0,
- lmb[e][0] * args.map_resolution / 100.0, 0.]
-
- for e in range(num_scenes):
- local_map[e] = full_map[e, :,
- lmb[e, 0]:lmb[e, 1],
- lmb[e, 2]:lmb[e, 3]]
- local_pose[e] = full_pose[e] - \
- torch.from_numpy(origins[e]).to(device).float()
-
- def init_map_and_pose_for_env(e):
- full_map[e].fill_(0.)
- full_pose[e].fill_(0.)
- full_pose[e, :2] = args.map_size_cm / 100.0 / 2.0
-
- locs = full_pose[e].cpu().numpy()
- planner_pose_inputs[e, :3] = locs
- r, c = locs[1], locs[0]
- loc_r, loc_c = [int(r * 100.0 / args.map_resolution),
- int(c * 100.0 / args.map_resolution)]
-
- full_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 2] = 1.0
-
- lmb[e] = get_local_map_boundaries((loc_r, loc_c),
- (local_w, local_h),
- (full_w, full_h))
-
- planner_pose_inputs[e, 3:] = lmb[e]
- origins[e] = [lmb[e][2] * args.map_resolution / 100.0,
- lmb[e][0] * args.map_resolution / 100.0, 0.]
-
- local_map[e] = full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]]
- local_pose[e] = full_pose[e] - \
- torch.from_numpy(origins[e]).to(device).float()
-
- def update_intrinsic_rew(e):
- prev_explored_area = full_map[e, 1].sum(1).sum(0)
- full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]] = \
- local_map[e]
- curr_explored_area = full_map[e, 1].sum(1).sum(0)
- intrinsic_rews[e] = curr_explored_area - prev_explored_area
- intrinsic_rews[e] *= (args.map_resolution / 100.)**2 # to m^2
-
- init_map_and_pose()
-
- # Global policy observation space
- ngc = 8 + args.num_sem_categories
- es = 2
- g_observation_space = gym.spaces.Box(0, 1,
- (ngc,
- local_w,
- local_h), dtype='uint8')
-
- # Global policy action space
- g_action_space = gym.spaces.Box(low=0.0, high=0.99,
- shape=(2,), dtype=np.float32)
-
- # Global policy recurrent layer size
- g_hidden_size = args.global_hidden_size
-
- # Semantic Mapping
- sem_map_module = Semantic_Mapping(args).to(device)
- sem_map_module.eval()
-
- # Global policy
- g_policy = RL_Policy(g_observation_space.shape, g_action_space,
- model_type=1,
- base_kwargs={'recurrent': args.use_recurrent_global,
- 'hidden_size': g_hidden_size,
- 'num_sem_categories': ngc - 8
- }).to(device)
- g_agent = algo.PPO(g_policy, args.clip_param, args.ppo_epoch,
- args.num_mini_batch, args.value_loss_coef,
- args.entropy_coef, lr=args.lr, eps=args.eps,
- max_grad_norm=args.max_grad_norm)
-
- global_input = torch.zeros(num_scenes, ngc, local_w, local_h)
- global_orientation = torch.zeros(num_scenes, 1).long()
- intrinsic_rews = torch.zeros(num_scenes).to(device)
- extras = torch.zeros(num_scenes, 2)
-
- # Storage
- g_rollouts = GlobalRolloutStorage(args.num_global_steps,
- num_scenes, g_observation_space.shape,
- g_action_space, g_policy.rec_state_size,
- es).to(device)
-
- if args.load != "0":
- print("Loading model {}".format(args.load))
- state_dict = torch.load(args.load,
- map_location=lambda storage, loc: storage)
- g_policy.load_state_dict(state_dict)
-
- if args.eval:
- g_policy.eval()
-
- # Predict semantic map from frame 1
- poses = torch.from_numpy(np.asarray(
- [infos[env_idx]['sensor_pose'] for env_idx in range(num_scenes)])
- ).float().to(device)
-
- _, local_map, _, local_pose = \
- sem_map_module(obs, poses, local_map, local_pose)
-
- # Compute Global policy input
- locs = local_pose.cpu().numpy()
- global_input = torch.zeros(num_scenes, ngc, local_w, local_h)
- global_orientation = torch.zeros(num_scenes, 1).long()
-
- for e in range(num_scenes):
- r, c = locs[e, 1], locs[e, 0]
- loc_r, loc_c = [int(r * 100.0 / args.map_resolution),
- int(c * 100.0 / args.map_resolution)]
-
- local_map[e, 2:4, loc_r - 1:loc_r + 2, loc_c - 1:loc_c + 2] = 1.
- global_orientation[e] = int((locs[e, 2] + 180.0) / 5.)
-
- global_input[:, 0:4, :, :] = local_map[:, 0:4, :, :].detach()
- global_input[:, 4:8, :, :] = nn.MaxPool2d(args.global_downscaling)(
- full_map[:, 0:4, :, :])
- global_input[:, 8:, :, :] = local_map[:, 4:, :, :].detach()
- goal_cat_id = torch.from_numpy(np.asarray(
- [infos[env_idx]['goal_cat_id'] for env_idx
- in range(num_scenes)]))
-
- extras = torch.zeros(num_scenes, 2)
- extras[:, 0] = global_orientation[:, 0]
- extras[:, 1] = goal_cat_id
-
- g_rollouts.obs[0].copy_(global_input)
- g_rollouts.extras[0].copy_(extras)
-
- # Run Global Policy (global_goals = Long-Term Goal)
- g_value, g_action, g_action_log_prob, g_rec_states = \
- g_policy.act(
- g_rollouts.obs[0],
- g_rollouts.rec_states[0],
- g_rollouts.masks[0],
- extras=g_rollouts.extras[0],
- deterministic=False
- )
-
- cpu_actions = nn.Sigmoid()(g_action).cpu().numpy()
- global_goals = [[int(action[0] * local_w), int(action[1] * local_h)]
- for action in cpu_actions]
- global_goals = [[min(x, int(local_w - 1)), min(y, int(local_h - 1))]
- for x, y in global_goals]
-
- goal_maps = [np.zeros((local_w, local_h)) for _ in range(num_scenes)]
-
- for e in range(num_scenes):
- goal_maps[e][global_goals[e][0], global_goals[e][1]] = 1
-
- planner_inputs = [{} for e in range(num_scenes)]
- for e, p_input in enumerate(planner_inputs):
- p_input['map_pred'] = local_map[e, 0, :, :].cpu().numpy()
- p_input['exp_pred'] = local_map[e, 1, :, :].cpu().numpy()
- p_input['pose_pred'] = planner_pose_inputs[e]
- p_input['goal'] = goal_maps[e] # global_goals[e]
- p_input['new_goal'] = 1
- p_input['found_goal'] = 0
- p_input['wait'] = wait_env[e] or finished[e]
- if args.visualize or args.print_images:
- local_map[e, -1, :, :] = 1e-5
- p_input['sem_map_pred'] = local_map[e, 4:, :, :
- ].argmax(0).cpu().numpy()
-
- obs, _, done, infos = envs.plan_act_and_preprocess(planner_inputs)
-
- start = time.time()
- g_reward = 0
-
- torch.set_grad_enabled(False)
- spl_per_category = defaultdict(list)
- success_per_category = defaultdict(list)
-
- for step in range(args.num_training_frames // args.num_processes + 1):
- if finished.sum() == args.num_processes:
- break
-
- g_step = (step // args.num_local_steps) % args.num_global_steps
- l_step = step % args.num_local_steps
-
- # ------------------------------------------------------------------
- # Reinitialize variables when episode ends
- l_masks = torch.FloatTensor([0 if x else 1
- for x in done]).to(device)
- g_masks *= l_masks
-
- for e, x in enumerate(done):
- if x:
- spl = infos[e]['spl']
- success = infos[e]['success']
- dist = infos[e]['distance_to_goal']
- spl_per_category[infos[e]['goal_name']].append(spl)
- success_per_category[infos[e]['goal_name']].append(success)
- if args.eval:
- episode_success[e].append(success)
- episode_spl[e].append(spl)
- episode_dist[e].append(dist)
- if len(episode_success[e]) == num_episodes:
- finished[e] = 1
- else:
- episode_success.append(success)
- episode_spl.append(spl)
- episode_dist.append(dist)
- wait_env[e] = 1.
- update_intrinsic_rew(e)
- init_map_and_pose_for_env(e)
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Semantic Mapping Module
- poses = torch.from_numpy(np.asarray(
- [infos[env_idx]['sensor_pose'] for env_idx
- in range(num_scenes)])
- ).float().to(device)
-
- _, local_map, _, local_pose = \
- sem_map_module(obs, poses, local_map, local_pose)
-
- locs = local_pose.cpu().numpy()
- planner_pose_inputs[:, :3] = locs + origins
- local_map[:, 2, :, :].fill_(0.) # Resetting current location channel
- for e in range(num_scenes):
- r, c = locs[e, 1], locs[e, 0]
- loc_r, loc_c = [int(r * 100.0 / args.map_resolution),
- int(c * 100.0 / args.map_resolution)]
- local_map[e, 2:4, loc_r - 2:loc_r + 3, loc_c - 2:loc_c + 3] = 1.
-
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Global Policy
- if l_step == args.num_local_steps - 1:
- # For every global step, update the full and local maps
- for e in range(num_scenes):
- if wait_env[e] == 1: # New episode
- wait_env[e] = 0.
- else:
- update_intrinsic_rew(e)
-
- full_map[e, :, lmb[e, 0]:lmb[e, 1], lmb[e, 2]:lmb[e, 3]] = \
- local_map[e]
- full_pose[e] = local_pose[e] + \
- torch.from_numpy(origins[e]).to(device).float()
-
- locs = full_pose[e].cpu().numpy()
- r, c = locs[1], locs[0]
- loc_r, loc_c = [int(r * 100.0 / args.map_resolution),
- int(c * 100.0 / args.map_resolution)]
-
- lmb[e] = get_local_map_boundaries((loc_r, loc_c),
- (local_w, local_h),
- (full_w, full_h))
-
- planner_pose_inputs[e, 3:] = lmb[e]
- origins[e] = [lmb[e][2] * args.map_resolution / 100.0,
- lmb[e][0] * args.map_resolution / 100.0, 0.]
-
- local_map[e] = full_map[e, :,
- lmb[e, 0]:lmb[e, 1],
- lmb[e, 2]:lmb[e, 3]]
- local_pose[e] = full_pose[e] - \
- torch.from_numpy(origins[e]).to(device).float()
-
- locs = local_pose.cpu().numpy()
- for e in range(num_scenes):
- global_orientation[e] = int((locs[e, 2] + 180.0) / 5.)
- global_input[:, 0:4, :, :] = local_map[:, 0:4, :, :]
- global_input[:, 4:8, :, :] = \
- nn.MaxPool2d(args.global_downscaling)(
- full_map[:, 0:4, :, :])
- global_input[:, 8:, :, :] = local_map[:, 4:, :, :].detach()
- goal_cat_id = torch.from_numpy(np.asarray(
- [infos[env_idx]['goal_cat_id'] for env_idx
- in range(num_scenes)]))
- extras[:, 0] = global_orientation[:, 0]
- extras[:, 1] = goal_cat_id
-
- # Get exploration reward and metrics
- g_reward = torch.from_numpy(np.asarray(
- [infos[env_idx]['g_reward'] for env_idx in range(num_scenes)])
- ).float().to(device)
- g_reward += args.intrinsic_rew_coeff * intrinsic_rews.detach()
-
- g_process_rewards += g_reward.cpu().numpy()
- g_total_rewards = g_process_rewards * \
- (1 - g_masks.cpu().numpy())
- g_process_rewards *= g_masks.cpu().numpy()
- per_step_g_rewards.append(np.mean(g_reward.cpu().numpy()))
-
- if np.sum(g_total_rewards) != 0:
- for total_rew in g_total_rewards:
- if total_rew != 0:
- g_episode_rewards.append(total_rew)
-
- # Add samples to global policy storage
- if step == 0:
- g_rollouts.obs[0].copy_(global_input)
- g_rollouts.extras[0].copy_(extras)
- else:
- g_rollouts.insert(
- global_input, g_rec_states,
- g_action, g_action_log_prob, g_value,
- g_reward, g_masks, extras
- )
-
- # Sample long-term goal from global policy
- g_value, g_action, g_action_log_prob, g_rec_states = \
- g_policy.act(
- g_rollouts.obs[g_step + 1],
- g_rollouts.rec_states[g_step + 1],
- g_rollouts.masks[g_step + 1],
- extras=g_rollouts.extras[g_step + 1],
- deterministic=False
- )
- cpu_actions = nn.Sigmoid()(g_action).cpu().numpy()
- global_goals = [[int(action[0] * local_w),
- int(action[1] * local_h)]
- for action in cpu_actions]
- global_goals = [[min(x, int(local_w - 1)),
- min(y, int(local_h - 1))]
- for x, y in global_goals]
-
- g_reward = 0
- g_masks = torch.ones(num_scenes).float().to(device)
-
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Update long-term goal if target object is found
- found_goal = [0 for _ in range(num_scenes)]
- goal_maps = [np.zeros((local_w, local_h)) for _ in range(num_scenes)]
-
- for e in range(num_scenes):
- goal_maps[e][global_goals[e][0], global_goals[e][1]] = 1
-
- for e in range(num_scenes):
- cn = infos[e]['goal_cat_id'] + 4
- if local_map[e, cn, :, :].sum() != 0.:
- cat_semantic_map = local_map[e, cn, :, :].cpu().numpy()
- cat_semantic_scores = cat_semantic_map
- cat_semantic_scores[cat_semantic_scores > 0] = 1.
- goal_maps[e] = cat_semantic_scores
- found_goal[e] = 1
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Take action and get next observation
- planner_inputs = [{} for e in range(num_scenes)]
- for e, p_input in enumerate(planner_inputs):
- p_input['map_pred'] = local_map[e, 0, :, :].cpu().numpy()
- p_input['exp_pred'] = local_map[e, 1, :, :].cpu().numpy()
- p_input['pose_pred'] = planner_pose_inputs[e]
- p_input['goal'] = goal_maps[e] # global_goals[e]
- p_input['new_goal'] = l_step == args.num_local_steps - 1
- p_input['found_goal'] = found_goal[e]
- p_input['wait'] = wait_env[e] or finished[e]
- if args.visualize or args.print_images:
- local_map[e, -1, :, :] = 1e-5
- p_input['sem_map_pred'] = local_map[e, 4:, :,
- :].argmax(0).cpu().numpy()
-
- obs, _, done, infos = envs.plan_act_and_preprocess(planner_inputs)
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Training
- torch.set_grad_enabled(True)
- if g_step % args.num_global_steps == args.num_global_steps - 1 \
- and l_step == args.num_local_steps - 1:
- if not args.eval:
- g_next_value = g_policy.get_value(
- g_rollouts.obs[-1],
- g_rollouts.rec_states[-1],
- g_rollouts.masks[-1],
- extras=g_rollouts.extras[-1]
- ).detach()
-
- g_rollouts.compute_returns(g_next_value, args.use_gae,
- args.gamma, args.tau)
- g_value_loss, g_action_loss, g_dist_entropy = \
- g_agent.update(g_rollouts)
- g_value_losses.append(g_value_loss)
- g_action_losses.append(g_action_loss)
- g_dist_entropies.append(g_dist_entropy)
- g_rollouts.after_update()
-
- torch.set_grad_enabled(False)
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Logging
- if step % args.log_interval == 0:
- end = time.time()
- time_elapsed = time.gmtime(end - start)
- log = " ".join([
- "Time: {0:0=2d}d".format(time_elapsed.tm_mday - 1),
- "{},".format(time.strftime("%Hh %Mm %Ss", time_elapsed)),
- "num timesteps {},".format(step * num_scenes),
- "FPS {},".format(int(step * num_scenes / (end - start)))
- ])
-
- log += "\n\tRewards:"
-
- if len(g_episode_rewards) > 0:
- log += " ".join([
- " Global step mean/med rew:",
- "{:.4f}/{:.4f},".format(
- np.mean(per_step_g_rewards),
- np.median(per_step_g_rewards)),
- " Global eps mean/med/min/max eps rew:",
- "{:.3f}/{:.3f}/{:.3f}/{:.3f},".format(
- np.mean(g_episode_rewards),
- np.median(g_episode_rewards),
- np.min(g_episode_rewards),
- np.max(g_episode_rewards))
- ])
-
- if args.eval:
- total_success = []
- total_spl = []
- total_dist = []
- for e in range(args.num_processes):
- for acc in episode_success[e]:
- total_success.append(acc)
- for dist in episode_dist[e]:
- total_dist.append(dist)
- for spl in episode_spl[e]:
- total_spl.append(spl)
-
- if len(total_spl) > 0:
- log += " ObjectNav succ/spl/dtg:"
- log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format(
- np.mean(total_success),
- np.mean(total_spl),
- np.mean(total_dist),
- len(total_spl))
- else:
- if len(episode_success) > 100:
- log += " ObjectNav succ/spl/dtg:"
- log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format(
- np.mean(episode_success),
- np.mean(episode_spl),
- np.mean(episode_dist),
- len(episode_spl))
-
- log += "\n\tLosses:"
- if len(g_value_losses) > 0 and not args.eval:
- log += " ".join([
- " Policy Loss value/action/dist:",
- "{:.3f}/{:.3f}/{:.3f},".format(
- np.mean(g_value_losses),
- np.mean(g_action_losses),
- np.mean(g_dist_entropies))
- ])
-
- print(log)
- logging.info(log)
- # ------------------------------------------------------------------
-
- # ------------------------------------------------------------------
- # Save best models
- if (step * num_scenes) % args.save_interval < \
- num_scenes:
- if len(g_episode_rewards) >= 1000 and \
- (np.mean(g_episode_rewards) >= best_g_reward) \
- and not args.eval:
- torch.save(g_policy.state_dict(),
- os.path.join(log_dir, "model_best.pth"))
- best_g_reward = np.mean(g_episode_rewards)
-
- # Save periodic models
- if (step * num_scenes) % args.save_periodic < \
- num_scenes:
- total_steps = step * num_scenes
- if not args.eval:
- torch.save(g_policy.state_dict(),
- os.path.join(dump_dir,
- "periodic_{}.pth".format(total_steps)))
- # ------------------------------------------------------------------
-
- # Print and save model performance numbers during evaluation
- if args.eval:
- print("Dumping eval details...")
-
- total_success = []
- total_spl = []
- total_dist = []
- for e in range(args.num_processes):
- for acc in episode_success[e]:
- total_success.append(acc)
- for dist in episode_dist[e]:
- total_dist.append(dist)
- for spl in episode_spl[e]:
- total_spl.append(spl)
-
- if len(total_spl) > 0:
- log = "Final ObjectNav succ/spl/dtg:"
- log += " {:.3f}/{:.3f}/{:.3f}({:.0f}),".format(
- np.mean(total_success),
- np.mean(total_spl),
- np.mean(total_dist),
- len(total_spl))
-
- print(log)
- logging.info(log)
-
- # Save the spl per category
- log = "Success | SPL per category\n"
- for key in success_per_category:
- log += "{}: {} | {}\n".format(key,
- sum(success_per_category[key]) /
- len(success_per_category[key]),
- sum(spl_per_category[key]) /
- len(spl_per_category[key]))
-
- print(log)
- logging.info(log)
-
- with open('{}/{}_spl_per_cat_pred_thr.json'.format(
- dump_dir, args.split), 'w') as f:
- json.dump(spl_per_category, f)
-
- with open('{}/{}_success_per_cat_pred_thr.json'.format(
- dump_dir, args.split), 'w') as f:
- json.dump(success_per_category, f)
-
-
-if __name__ == "__main__":
- main()
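The map bookkeeping in the deleted `main.py` hinges on `get_local_map_boundaries`: the local window is centered on the agent and clamped so it never leaves the full map. Below is a standalone restatement of that logic with a toy check; it is a sketch for reference, not part of the patch, and the 480/240 sizes are example values.

```python
def get_local_map_boundaries(agent_loc, local_sizes, full_sizes,
                             global_downscaling=2):
    """Clamp an agent-centered local window to the full map extent."""
    loc_r, loc_c = agent_loc
    local_w, local_h = local_sizes
    full_w, full_h = full_sizes

    if global_downscaling > 1:
        gx1, gy1 = loc_r - local_w // 2, loc_c - local_h // 2
        gx2, gy2 = gx1 + local_w, gy1 + local_h
        if gx1 < 0:
            gx1, gx2 = 0, local_w
        if gx2 > full_w:
            gx1, gx2 = full_w - local_w, full_w
        if gy1 < 0:
            gy1, gy2 = 0, local_h
        if gy2 > full_h:
            gy1, gy2 = full_h - local_h, full_h
    else:
        gx1, gx2, gy1, gy2 = 0, full_w, 0, full_h
    return [gx1, gx2, gy1, gy2]


# Agent near a corner of a 480x480 map with a 240x240 local window:
assert get_local_map_boundaries((10, 470), (240, 240), (480, 480)) == [0, 240, 240, 480]
```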
diff --git a/model.py b/model.py
deleted file mode 100755
index c912ce0..0000000
--- a/model.py
+++ /dev/null
@@ -1,283 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-import numpy as np
-
-from utils.distributions import Categorical, DiagGaussian
-from utils.model import get_grid, ChannelPool, Flatten, NNBase
-import envs.utils.depth_utils as du
-
-
-class Goal_Oriented_Semantic_Policy(NNBase):
-
- def __init__(self, input_shape, recurrent=False, hidden_size=512,
- num_sem_categories=16):
- super(Goal_Oriented_Semantic_Policy, self).__init__(
- recurrent, hidden_size, hidden_size)
-
- out_size = int(input_shape[1] / 16.) * int(input_shape[2] / 16.)
-
- self.main = nn.Sequential(
- nn.MaxPool2d(2),
- nn.Conv2d(num_sem_categories + 8, 32, 3, stride=1, padding=1),
- nn.ReLU(),
- nn.MaxPool2d(2),
- nn.Conv2d(32, 64, 3, stride=1, padding=1),
- nn.ReLU(),
- nn.MaxPool2d(2),
- nn.Conv2d(64, 128, 3, stride=1, padding=1),
- nn.ReLU(),
- nn.MaxPool2d(2),
- nn.Conv2d(128, 64, 3, stride=1, padding=1),
- nn.ReLU(),
- nn.Conv2d(64, 32, 3, stride=1, padding=1),
- nn.ReLU(),
- Flatten()
- )
-
- self.linear1 = nn.Linear(out_size * 32 + 8 * 2, hidden_size)
- self.linear2 = nn.Linear(hidden_size, 256)
- self.critic_linear = nn.Linear(256, 1)
- self.orientation_emb = nn.Embedding(72, 8)
- self.goal_emb = nn.Embedding(num_sem_categories, 8)
- self.train()
-
- def forward(self, inputs, rnn_hxs, masks, extras):
- x = self.main(inputs)
- orientation_emb = self.orientation_emb(extras[:, 0])
- goal_emb = self.goal_emb(extras[:, 1])
-
- x = torch.cat((x, orientation_emb, goal_emb), 1)
-
- x = nn.ReLU()(self.linear1(x))
- if self.is_recurrent:
- x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks)
-
- x = nn.ReLU()(self.linear2(x))
-
- return self.critic_linear(x).squeeze(-1), x, rnn_hxs
-
-
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L15
-class RL_Policy(nn.Module):
-
- def __init__(self, obs_shape, action_space, model_type=0,
- base_kwargs=None):
-
- super(RL_Policy, self).__init__()
- if base_kwargs is None:
- base_kwargs = {}
-
- if model_type == 1:
- self.network = Goal_Oriented_Semantic_Policy(
- obs_shape, **base_kwargs)
- else:
- raise NotImplementedError
-
- if action_space.__class__.__name__ == "Discrete":
- num_outputs = action_space.n
- self.dist = Categorical(self.network.output_size, num_outputs)
- elif action_space.__class__.__name__ == "Box":
- num_outputs = action_space.shape[0]
- self.dist = DiagGaussian(self.network.output_size, num_outputs)
- else:
- raise NotImplementedError
-
- self.model_type = model_type
-
- @property
- def is_recurrent(self):
- return self.network.is_recurrent
-
- @property
- def rec_state_size(self):
- """Size of rnn_hx."""
- return self.network.rec_state_size
-
- def forward(self, inputs, rnn_hxs, masks, extras):
- if extras is None:
- return self.network(inputs, rnn_hxs, masks)
- else:
- return self.network(inputs, rnn_hxs, masks, extras)
-
- def act(self, inputs, rnn_hxs, masks, extras=None, deterministic=False):
-
- value, actor_features, rnn_hxs = self(inputs, rnn_hxs, masks, extras)
- dist = self.dist(actor_features)
-
- if deterministic:
- action = dist.mode()
- else:
- action = dist.sample()
-
- action_log_probs = dist.log_probs(action)
-
- return value, action, action_log_probs, rnn_hxs
-
- def get_value(self, inputs, rnn_hxs, masks, extras=None):
- value, _, _ = self(inputs, rnn_hxs, masks, extras)
- return value
-
- def evaluate_actions(self, inputs, rnn_hxs, masks, action, extras=None):
-
- value, actor_features, rnn_hxs = self(inputs, rnn_hxs, masks, extras)
- dist = self.dist(actor_features)
-
- action_log_probs = dist.log_probs(action)
- dist_entropy = dist.entropy().mean()
-
- return value, action_log_probs, dist_entropy, rnn_hxs
-
-
-class Semantic_Mapping(nn.Module):
-
- """
- Semantic_Mapping
- """
-
- def __init__(self, args):
- super(Semantic_Mapping, self).__init__()
-
- self.device = args.device
- self.screen_h = args.frame_height
- self.screen_w = args.frame_width
- self.resolution = args.map_resolution
- self.z_resolution = args.map_resolution
- self.map_size_cm = args.map_size_cm // args.global_downscaling
- self.n_channels = 3
- self.vision_range = args.vision_range
- self.dropout = 0.5
- self.fov = args.hfov
- self.du_scale = args.du_scale
- self.cat_pred_threshold = args.cat_pred_threshold
- self.exp_pred_threshold = args.exp_pred_threshold
- self.map_pred_threshold = args.map_pred_threshold
- self.num_sem_categories = args.num_sem_categories
-
- self.max_height = int(360 / self.z_resolution)
- self.min_height = int(-40 / self.z_resolution)
- self.agent_height = args.camera_height * 100.
- self.shift_loc = [self.vision_range *
- self.resolution // 2, 0, np.pi / 2.0]
- self.camera_matrix = du.get_camera_matrix(
- self.screen_w, self.screen_h, self.fov)
-
- self.pool = ChannelPool(1)
-
- vr = self.vision_range
-
- self.init_grid = torch.zeros(
- args.num_processes, 1 + self.num_sem_categories, vr, vr,
- self.max_height - self.min_height
- ).float().to(self.device)
- self.feat = torch.ones(
- args.num_processes, 1 + self.num_sem_categories,
- self.screen_h // self.du_scale * self.screen_w // self.du_scale
- ).float().to(self.device)
-
- def forward(self, obs, pose_obs, maps_last, poses_last):
- bs, c, h, w = obs.size()
- depth = obs[:, 3, :, :]
-
- point_cloud_t = du.get_point_cloud_from_z_t(
- depth, self.camera_matrix, self.device, scale=self.du_scale)
-
- agent_view_t = du.transform_camera_view_t(
- point_cloud_t, self.agent_height, 0, self.device)
-
- agent_view_centered_t = du.transform_pose_t(
- agent_view_t, self.shift_loc, self.device)
-
- max_h = self.max_height
- min_h = self.min_height
- xy_resolution = self.resolution
- z_resolution = self.z_resolution
- vision_range = self.vision_range
- XYZ_cm_std = agent_view_centered_t.float()
- XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution)
- XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] -
- vision_range // 2.) / vision_range * 2.
- XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution
- XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] -
- (max_h + min_h) // 2.) / (max_h - min_h) * 2.
- self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)(
- obs[:, 4:, :, :]
- ).view(bs, c - 4, h // self.du_scale * w // self.du_scale)
-
- XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2)
- XYZ_cm_std = XYZ_cm_std.view(XYZ_cm_std.shape[0],
- XYZ_cm_std.shape[1],
- XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3])
-
- voxels = du.splat_feat_nd(
- self.init_grid * 0., self.feat, XYZ_cm_std).transpose(2, 3)
-
- min_z = int(25 / z_resolution - min_h)
- max_z = int((self.agent_height + 1) / z_resolution - min_h)
-
- agent_height_proj = voxels[..., min_z:max_z].sum(4)
- all_height_proj = voxels.sum(4)
-
- fp_map_pred = agent_height_proj[:, 0:1, :, :]
- fp_exp_pred = all_height_proj[:, 0:1, :, :]
- fp_map_pred = fp_map_pred / self.map_pred_threshold
- fp_exp_pred = fp_exp_pred / self.exp_pred_threshold
- fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0)
- fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0)
-
- pose_pred = poses_last
-
- agent_view = torch.zeros(bs, c,
- self.map_size_cm // self.resolution,
- self.map_size_cm // self.resolution
- ).to(self.device)
-
- x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2
- x2 = x1 + self.vision_range
- y1 = self.map_size_cm // (self.resolution * 2)
- y2 = y1 + self.vision_range
- agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred
- agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred
- agent_view[:, 4:, y1:y2, x1:x2] = torch.clamp(
- agent_height_proj[:, 1:, :, :] / self.cat_pred_threshold,
- min=0.0, max=1.0)
-
- corrected_pose = pose_obs
-
- def get_new_pose_batch(pose, rel_pose_change):
-
- pose[:, 1] += rel_pose_change[:, 0] * \
- torch.sin(pose[:, 2] / 57.29577951308232) \
- + rel_pose_change[:, 1] * \
- torch.cos(pose[:, 2] / 57.29577951308232)
- pose[:, 0] += rel_pose_change[:, 0] * \
- torch.cos(pose[:, 2] / 57.29577951308232) \
- - rel_pose_change[:, 1] * \
- torch.sin(pose[:, 2] / 57.29577951308232)
- pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232
-
- pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0
- pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0
-
- return pose
-
- current_poses = get_new_pose_batch(poses_last, corrected_pose)
- st_pose = current_poses.clone().detach()
-
- st_pose[:, :2] = - (st_pose[:, :2]
- * 100.0 / self.resolution
- - self.map_size_cm // (self.resolution * 2)) /\
- (self.map_size_cm // (self.resolution * 2))
- st_pose[:, 2] = 90. - (st_pose[:, 2])
-
- rot_mat, trans_mat = get_grid(st_pose, agent_view.size(),
- self.device)
-
- rotated = F.grid_sample(agent_view, rot_mat, align_corners=True)
- translated = F.grid_sample(rotated, trans_mat, align_corners=True)
-
- maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1)
-
- map_pred, _ = torch.max(maps2, 1)
-
- return fp_map_pred, map_pred, pose_pred, current_poses
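For reference, the global policy deleted in `model.py` is driven exactly as in `main.py` above: a Box(2) action squashed through a sigmoid becomes a long-term goal cell in the local map. The following construction-and-sampling sketch is illustrative only; the 240-cell map, 16 categories, and 256 hidden size are example values consistent with the shapes used above, not requirements.

```python
import gym
import numpy as np
import torch

from model import RL_Policy  # the module deleted above

num_sem_categories = 16
ngc = 8 + num_sem_categories          # map channels fed to the global policy
local_w = local_h = 240

obs_space = gym.spaces.Box(0, 1, (ngc, local_w, local_h), dtype='uint8')
act_space = gym.spaces.Box(low=0.0, high=0.99, shape=(2,), dtype=np.float32)

g_policy = RL_Policy(obs_space.shape, act_space, model_type=1,
                     base_kwargs={'recurrent': False,
                                  'hidden_size': 256,
                                  'num_sem_categories': num_sem_categories})

obs = torch.zeros(1, ngc, local_w, local_h)
rec = torch.zeros(1, g_policy.rec_state_size)
masks = torch.ones(1, 1)
extras = torch.zeros(1, 2).long()     # [orientation bin, goal category id]

value, action, log_prob, rec = g_policy.act(obs, rec, masks, extras=extras)
# Long-term goal cell in the local map (simplified clamp, as in main.py):
goal_cell = (torch.sigmoid(action[0]) * local_w).long().clamp(max=local_w - 1)
```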
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 8c8800c..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-scikit-fmm==2019.1.30
-scikit-learn==0.22.2.post1
-scikit-image==0.15.0
-numpy>=1.20.2
-ifcfg
diff --git a/semantic_exploration/README.md b/semantic_exploration/README.md
new file mode 100644
index 0000000..f0c4e4d
--- /dev/null
+++ b/semantic_exploration/README.md
@@ -0,0 +1 @@
+# semantic_exploration
diff --git a/semantic_exploration/agents/sem_exp.py b/semantic_exploration/agents/sem_exp.py
new file mode 100644
index 0000000..5fb1ec3
--- /dev/null
+++ b/semantic_exploration/agents/sem_exp.py
@@ -0,0 +1,577 @@
+# -*- coding: utf-8 -*-
+import math
+import os
+
+import third_party.semantic_exploration.agents.utils.visualization as vu
+import cv2
+import third_party.semantic_exploration.envs.utils.pose as pu
+from third_party.semantic_exploration.envs.utils.fmm_planner import FMMPlanner
+import numpy as np
+import skimage.morphology
+from third_party.semantic_exploration.agents.utils.detic_semantic_prediction import SemanticPredDetic
+from third_party.semantic_exploration.agents.utils.owlvit_semantic_prediction import SemanticPredOwlvit
+from third_party.semantic_exploration.agents.utils.semantic_prediction import SemanticPredMaskRCNN
+from third_party.semantic_exploration.constants import color_palette
+from PIL import Image
+from torchvision import transforms
+
+
+class Sem_Exp_Env_Agent:
+ """The Sem_Exp environment agent class. A seperate Sem_Exp_Env_Agent class
+ object is used for each environment thread.
+
+ """
+
+ def __init__(self, config, rank=1):
+ self.config = config
+ # initialize transform for RGB observations
+ self.res = transforms.Compose(
+ [
+ transforms.ToPILImage(),
+ transforms.Resize(
+ (self.config.FRAME_HEIGHT, self.config.FRAME_WIDTH),
+ interpolation=Image.NEAREST,
+ ),
+ ]
+ )
+
+ if self.config.DETECTION_MODEL == "detectron2":
+ self.sem_pred = SemanticPredMaskRCNN(self.config)
+ elif self.config.DETECTION_MODEL == "detic":
+ self.sem_pred = SemanticPredDetic(self.config)
+ elif self.config.DETECTION_MODEL == "owlvit":
+ self.sem_pred = SemanticPredOwlvit(self.config)
+ else:
+ raise NotImplementedError
+
+ # initializations for planning:
+ self.selem = skimage.morphology.disk(self.config.OBS_DILATION_SELEM_RADIUS)
+ self.obs = None
+ self.info = None
+ self.obs_shape = None
+ self.collision_map = None
+ self.visited = None
+ self.visited_vis = None
+ self.col_width = None
+ self.curr_loc = None
+ self.last_loc = None
+ self.last_action = None
+ self.count_forward_actions = None
+
+ if self.config.PLANNER == "frontier":
+ self.start_obs_dilation_selem_radius = self.config.OBS_DILATION_SELEM_RADIUS
+ self.goal_dilation_selem_radius = self.config.GOAL_DILATION_SELEM_RADIUS
+ self.min_obs_dilation_selem_radius = (
+ self.config.MIN_OBS_DILATION_SELEM_RADIUS
+ )
+ self.agent_cell_radius = self.config.AGENT_CELL_RADIUS
+ self.goal_tolerance = self.config.GOAL_TOLERANCE
+ self.continuous_angle_tolerance = self.config.CONTINUOUS_ANGLE_TOLERANCE
+ self.curr_obs_dilation_selem_radius = None
+ self.obs_dilation_selem = None
+
+ if self.config.VISUALIZE:
+ this_dir = os.path.dirname(os.path.abspath(__file__))
+ semantic_exploration_dir = os.path.join(os.path.dirname(this_dir))
+ self.legend = cv2.imread(semantic_exploration_dir+"/docs/legend.png")
+ self.vis_image = None
+ self.rgb_vis = None
+ self.depth_vis = None
+ self.goal_name = None
+ self.timestep = 0
+ self.rank = rank
+ self.episode_no = 0
+ self.cur_stg = None
+
+ def reset(self, obs_size, goal_name):
+ self.info = None
+ self.obs_shape = obs_size
+ self.goal_name = goal_name
+
+ # Episode initializations
+ map_shape = (
+ self.config.MAP_SIZE_CM // self.config.MAP_RESOLUTION,
+ self.config.MAP_SIZE_CM // self.config.MAP_RESOLUTION,
+ )
+ self.collision_map = np.zeros(map_shape)
+ self.visited = np.zeros(map_shape)
+ self.visited_vis = np.zeros(map_shape)
+ self.col_width = 1
+ self.count_forward_actions = 0
+ self.curr_loc = [
+ self.config.MAP_SIZE_CM / 100.0 / 2.0,
+ self.config.MAP_SIZE_CM / 100.0 / 2.0,
+ 0.0,
+ ]
+ self.last_action = None
+
+ if self.config.PLANNER == "frontier":
+ self.curr_obs_dilation_selem_radius = self.start_obs_dilation_selem_radius
+ self.obs_dilation_selem = skimage.morphology.disk(
+ self.curr_obs_dilation_selem_radius
+ )
+
+ if self.config.VISUALIZE:
+ self.vis_image = vu.init_vis_image(self.goal_name, self.legend)
+ self.timestep = 0
+
+ def update_vis_image_goal(self, goal_name):
+ self.goal_name = goal_name
+ if self.config.VISUALIZE:
+ self.vis_image = vu.init_vis_image(self.goal_name, self.legend)
+
+ def plan_act_and_preprocess(self, planner_inputs, info):
+ """Function responsible for planning, taking the action and
+ preprocessing observations
+
+ Args:
+ planner_inputs (dict):
+ dict with following keys:
+ 'map_pred' (ndarray): (M, M) map prediction
+ 'goal' (ndarray): (M, M) matrix denoting goal locations
+ 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
+ and planning window (gx1, gx2, gy1, gy2)
+ 'found_goal' (bool): whether the goal object is found
+
+ Returns:
+ obs (ndarray): preprocessed observations ((4+C) x H x W)
+ reward (float): amount of reward returned after previous action
+ done (bool): whether the episode has ended
+ info (dict): contains timestep, pose, goal category and
+ evaluation metric info
+ """
+
+ self.info = info
+ # plan
+ if planner_inputs["wait"]:
+ self.last_action = None
+ self.info["sensor_pose"] = [0.0, 0.0, 0.0]
+ return np.zeros(self.obs.shape), 0.0, False, self.info
+
+ action = self._plan(planner_inputs)
+
+ if self.config.VISUALIZE:
+ self._visualize(planner_inputs)
+
+ self.timestep += 1
+
+ if action >= 0:
+ # act
+ action = {"action": action}
+ obs = self.info["state"]
+ self.last_action = action["action"]
+ self.obs = obs
+ self.info = info
+ self.info["action"] = action
+
+ return obs, 0.0, False, info
+
+ else:
+ self.last_action = None
+ self.info["sensor_pose"] = [0.0, 0.0, 0.0]
+ self.info["action"] = -1
+ return np.zeros(self.obs_shape), 0.0, False, self.info
+
+ def _reach_goal_if_in_map(self, goal_map, found_goal):
+ height = goal_map.shape[0]
+ width = goal_map.shape[1]
+ init_goal_map = np.zeros((height, width))
+ if found_goal:
+ init_goal_map = goal_map
+ return init_goal_map
+
+ def _explore_otherwise(self, exp_pred, goal_map, found_goal):
+ """Explore closest unexplored region otherwise."""
+ # Select unexplored area
+ frontier_map = exp_pred == 0
+ self.dilate_explored_kernel = skimage.morphology.disk(10)
+ # Dilate explored area
+ frontier_map = 1 - skimage.morphology.binary_dilation(
+ 1 - frontier_map, self.dilate_explored_kernel
+ )
+
+ self.select_border_kernel = skimage.morphology.disk(1)
+ # Select the frontier
+ frontier_map = (
+ skimage.morphology.binary_dilation(frontier_map, self.select_border_kernel)
+ - frontier_map
+ )
+
+ if not found_goal:
+ goal_map = frontier_map
+
+ return goal_map
+
+ def _plan(self, planner_inputs):
+ """Function responsible for planning
+
+ Args:
+ planner_inputs (dict):
+ dict with following keys:
+ 'map_pred' (ndarray): (M, M) map prediction
+ 'goal' (ndarray): (M, M) goal locations
+ 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
+ and planning window (gx1, gx2, gy1, gy2)
+ 'found_goal' (bool): whether the goal object is found
+
+ Returns:
+ action (int): action id
+ """
+
+ self.last_loc = self.curr_loc
+
+ # Get Map prediction (obstacle)
+ map_pred = np.rint(planner_inputs["map_pred"])
+ if self.config.PLANNER == "frontier":
+ goal = self._reach_goal_if_in_map(
+ planner_inputs["goal"], planner_inputs["found_goal"]
+ )
+ goal = self._explore_otherwise(
+ planner_inputs["exp_pred"], goal, planner_inputs["found_goal"]
+ )
+ else:
+ goal = planner_inputs["goal"]
+
+ # Get pose prediction and global policy planning window
+ start_x, start_y, start_o, gx1, gx2, gy1, gy2 = planner_inputs["pose_pred"]
+ gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
+ planning_window = [gx1, gx2, gy1, gy2]
+
+ # Get curr loc
+ self.curr_loc = [start_x, start_y, start_o]
+ r, c = start_y, start_x
+ start = [
+ int(r * 100.0 / self.config.MAP_RESOLUTION - gx1),
+ int(c * 100.0 / self.config.MAP_RESOLUTION - gy1),
+ ]
+ start = pu.threshold_poses(start, map_pred.shape)
+
+ self.visited[gx1:gx2, gy1:gy2][
+ start[0] - 0 : start[0] + 1, start[1] - 0 : start[1] + 1
+ ] = 1
+
+ if self.config.VISUALIZE:
+ # Get last loc
+ last_start_x, last_start_y = self.last_loc[0], self.last_loc[1]
+ r, c = last_start_y, last_start_x
+ last_start = [
+ int(r * 100.0 / self.config.MAP_RESOLUTION - gx1),
+ int(c * 100.0 / self.config.MAP_RESOLUTION - gy1),
+ ]
+ last_start = pu.threshold_poses(last_start, map_pred.shape)
+ self.visited_vis[gx1:gx2, gy1:gy2] = vu.draw_line(
+ last_start, start, self.visited_vis[gx1:gx2, gy1:gy2]
+ )
+
+ # Collision check
+ if self.last_action == 1:
+ x1, y1, t1 = self.last_loc
+ x2, y2, _ = self.curr_loc
+ buf = 4
+ length = 2
+
+ if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05:
+ self.col_width += 2
+ if self.col_width == 7:
+ length = 4
+ buf = 3
+ self.col_width = min(self.col_width, 5)
+ else:
+ self.col_width = 1
+
+ dist = pu.get_l2_distance(x1, x2, y1, y2)
+ if dist < self.config.COLLISION_THRESHOLD: # Collision
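+                # Stamp a small rectangular patch ahead of the previous pose into
+                # the collision map; col_width widens the patch when the agent
+                # keeps colliding at roughly the same spot.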
+ width = self.col_width
+ for i in range(length):
+ for j in range(width):
+ wx = x1 + 0.05 * (
+ (i + buf) * np.cos(np.deg2rad(t1))
+ + (j - width // 2) * np.sin(np.deg2rad(t1))
+ )
+ wy = y1 + 0.05 * (
+ (i + buf) * np.sin(np.deg2rad(t1))
+ - (j - width // 2) * np.cos(np.deg2rad(t1))
+ )
+ r, c = wy, wx
+ r, c = int(r * 100 / self.config.MAP_RESOLUTION), int(
+ c * 100 / self.config.MAP_RESOLUTION
+ )
+ [r, c] = pu.threshold_poses([r, c], self.collision_map.shape)
+ self.collision_map[r, c] = 1
+
+ stg, replan, stop = self._get_stg(
+ map_pred, start, np.copy(goal), planning_window
+ )
+
+ # We were not able to find a path to the high-level goal
+ if replan and self.config.PLANNER == "frontier":
+ # Clean collision map
+ self.collision_map *= 0
+
+ # Reduce obstacle dilation
+ if self.curr_obs_dilation_selem_radius > 1:
+ self.curr_obs_dilation_selem_radius -= 1
+ self.obs_dilation_selem = skimage.morphology.disk(
+ self.curr_obs_dilation_selem_radius
+ )
+
+ # Deterministic Local Policy
+ if stop and planner_inputs["found_goal"] == 1:
+ if self._get_distance_to_obstacle() <= 0.2:
+ action = 0
+ else:
+ action = 1
+ else:
+ (stg_x, stg_y) = stg
+ angle_st_goal = math.degrees(math.atan2(stg_x - start[0], stg_y - start[1]))
+ angle_agent = (start_o) % 360.0
+ if angle_agent > 180:
+ angle_agent -= 360
+
+ relative_angle = (angle_agent - angle_st_goal) % 360.0
+ if relative_angle > 180:
+ relative_angle -= 360
+
+ if relative_angle > self.config.TURN_ANGLE / 2.0:
+ # Right
+ action = 3
+ elif relative_angle < -self.config.TURN_ANGLE / 2.0:
+ # Left
+ action = 2
+ else:
+ # Forward
+ action = 1
+
+ self.cur_stg = stg
+
+ return action
+
+ def _get_stg(self, grid, start, goal, planning_window):
+ """Get short-term goal"""
+
+ [gx1, gx2, gy1, gy2] = planning_window
+
+        x1, y1 = 0, 0
+ x2, y2 = grid.shape
+
+ def add_boundary(mat, value=1):
+ h, w = mat.shape
+ new_mat = np.zeros((h + 2, w + 2)) + value
+ new_mat[1 : h + 1, 1 : w + 1] = mat
+ return new_mat
+
+ if self.config.PLANNER == "frontier":
+ obstacles = grid[x1:x2, y1:y2]
+ # Dilate obstacles
+ dilated_obstacles = cv2.dilate(
+ obstacles, self.obs_dilation_selem, iterations=1
+ )
+ traversible = 1 - dilated_obstacles
+ else:
+ traversible = (
+ skimage.morphology.binary_dilation(grid[x1:x2, y1:y2], self.selem)
+ != True # noqa
+ )
+ traversible[self.collision_map[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 0
+ traversible[self.visited[gx1:gx2, gy1:gy2][x1:x2, y1:y2] == 1] = 1
+
+ traversible[
+ int(start[0] - x1) - 1 : int(start[0] - x1) + 2,
+ int(start[1] - y1) - 1 : int(start[1] - y1) + 2,
+ ] = 1
+
+ traversible = add_boundary(traversible)
+ goal = add_boundary(goal, value=0)
+
+ planner = FMMPlanner(traversible, step_size=self.config.PLANNER_STEP_SIZE)
+ # Set the goal size
+ selem = skimage.morphology.disk(self.config.GOAL_DILATION_SELEM_RADIUS)
+ goal = skimage.morphology.binary_dilation(goal, selem) != True # noqa
+ goal = 1 - goal * 1.0
+ planner.set_multi_goal(goal)
+
+ if self.config.VISUALIZE:
+ dump_dir = "{}/dump/{}/".format(self.config.DUMP_LOCATION, self.config.EXP_NAME)
+ ep_dir = "{}/episodes/thread_{}/eps_{}/".format(
+ dump_dir, self.rank, self.episode_no
+ )
+ if not os.path.exists(ep_dir):
+ os.makedirs(ep_dir)
+ r, c = traversible.shape
+ dist_vis = np.zeros((r, c * 3))
+ dist_vis[:, :c] = np.flipud(traversible)
+ dist_vis[:, c : 2 * c] = np.flipud(goal)
+ dist_vis[:, 2 * c :] = np.flipud(planner.fmm_dist / planner.fmm_dist.max())
+
+ fn = "{}/episodes/thread_{}/eps_{}/frontier-{}-{}-Vis-{}.png".format(
+ dump_dir,
+ self.rank,
+ self.episode_no,
+ self.rank,
+ self.episode_no,
+ self.timestep,
+ )
+
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ fontScale = 0.3
+ color = (0, 0, 255) # BGR
+ thickness = 1
+            dist_vis = cv2.cvtColor((255.0 * dist_vis).astype(np.uint8), cv2.COLOR_GRAY2BGR)
+            dist_vis = cv2.putText(dist_vis, "trav. (w: trav.; b: can't trav.)", (2, 25), font, fontScale, color, thickness, cv2.LINE_AA)
+            dist_vis = cv2.putText(dist_vis, "goal (w: goal; b: non-goal)", (c + 2, 25), font, fontScale, color, thickness, cv2.LINE_AA)
+            dist_vis = cv2.putText(dist_vis, "trav.+goal (w: non-goal target; b: goal target)", (2 * c + 2, 25), font, fontScale, color, thickness, cv2.LINE_AA)
+ cv2.imwrite(fn, dist_vis.astype(np.uint8))
+ cv2.waitKey(1)
+
+ state = [start[0] - x1 + 1, start[1] - y1 + 1]
+ # Add the replan flag
+ stg_x, stg_y, replan, stop = planner.get_short_term_goal(state)
+
+ stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1
+
+ return (stg_x, stg_y), replan, stop
+
+ def _preprocess_obs(self, obs, use_seg=True):
+ obs = obs.transpose(1, 2, 0)
+ rgb = obs[:, :, :3]
+ depth = obs[:, :, 3:4]
+
+ sem_seg_pred = self._get_sem_pred(rgb.astype(np.uint8), use_seg=use_seg)
+ self.depth_vis = depth
+ depth = self._preprocess_depth(
+ depth, self.config.MIN_DEPTH, self.config.MAX_DEPTH
+ )
+
+ ds = (
+ self.config.ENV_FRAME_WIDTH // self.config.FRAME_WIDTH
+ ) # Downscaling factor
+ if ds != 1:
+ rgb = np.asarray(self.res(rgb.astype(np.uint8)))
+ depth = depth[ds // 2 :: ds, ds // 2 :: ds]
+ sem_seg_pred = sem_seg_pred[ds // 2 :: ds, ds // 2 :: ds]
+
+ depth = np.expand_dims(depth, axis=2)
+ state = np.concatenate((rgb, depth, sem_seg_pred), axis=2).transpose(2, 0, 1)
+ return state
+
+ def _preprocess_depth(self, depth, min_d, max_d):
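+        # Depth is assumed to arrive normalized to [0, 1]: zero (invalid) readings
+        # are filled with the per-column maximum, values above 0.99 are remapped to
+        # a far sentinel of 100, and the result is converted to centimeters as
+        # min_d * 100 + depth * max_d * 100.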
+ depth = depth[:, :, 0] * 1
+
+ for i in range(depth.shape[1]):
+ depth[:, i][depth[:, i] == 0.0] = depth[:, i].max()
+
+ mask2 = depth > 0.99
+ depth[mask2] = 0.0
+
+ mask1 = depth == 0
+ depth[mask1] = 100.0
+ depth = min_d * 100.0 + depth * max_d * 100.0
+ return depth
+
+ def _get_sem_pred(self, rgb, use_seg=True):
+ if use_seg:
+ semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb)
+ semantic_pred = semantic_pred.astype(np.float32)
+ else:
+ semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16))
+ self.rgb_vis = rgb[:, :, ::-1]
+ return semantic_pred
+
+    def _get_distance_to_obstacle(self):
+        """Return the distance between the obstacle and the robot,
+        approximated by the displacement between the last and current poses."""
+        x1, y1, t1 = self.last_loc
+        x2, y2, _ = self.curr_loc
+        dist = pu.get_l2_distance(x1, x2, y1, y2)
+        return dist
+
+ def _visualize(self, inputs):
+ dump_dir = "{}/dump/{}/".format(self.config.DUMP_LOCATION, self.config.EXP_NAME)
+ ep_dir = "{}/episodes/thread_{}/eps_{}/".format(
+ dump_dir, self.rank, self.episode_no
+ )
+ if not os.path.exists(ep_dir):
+ os.makedirs(ep_dir)
+
+ map_pred = inputs["map_pred"]
+ exp_pred = inputs["exp_pred"]
+ start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs["pose_pred"]
+
+ goal = inputs["goal"]
+ goal[int(self.cur_stg[0]), int(self.cur_stg[1])] = 1
+ sem_map = inputs["sem_map_pred"]
+
+ gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
+
+ sem_map += 5
+
+ no_cat_mask = sem_map == self.config.NUM_SEM_CATEGORIES + 4 # 20
+ map_mask = np.rint(map_pred) == 1
+ exp_mask = np.rint(exp_pred) == 1
+ vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1
+
+ sem_map[no_cat_mask] = 0
+ m1 = np.logical_and(no_cat_mask, exp_mask)
+ sem_map[m1] = 2
+
+ m2 = np.logical_and(no_cat_mask, map_mask)
+ sem_map[m2] = 1
+
+ sem_map[vis_mask] = 3
+
+ selem = skimage.morphology.disk(self.goal_dilation_selem_radius)
+ goal_mat = 1 - skimage.morphology.binary_dilation(goal, selem) != True # noqa
+
+ goal_mask = goal_mat == 1
+ sem_map[goal_mask] = 4
+
+ color_pal = [int(x * 255.0) for x in color_palette]
+ sem_map_vis = Image.new("P", (sem_map.shape[1], sem_map.shape[0]))
+ sem_map_vis.putpalette(color_pal)
+ sem_map_vis.putdata(sem_map.flatten().astype(np.uint8))
+ sem_map_vis = sem_map_vis.convert("RGB")
+ sem_map_vis = np.flipud(sem_map_vis)
+
+ sem_map_vis = sem_map_vis[:, :, [2, 1, 0]]
+ sem_map_vis = cv2.resize(
+ sem_map_vis, (480, 480), interpolation=cv2.INTER_NEAREST
+ )
+ self.depth_vis = cv2.cvtColor((255.0 * self.depth_vis).astype(np.uint8), cv2.COLOR_GRAY2BGR)
+ self.vis_image[
+ 50 : 50 + self.config.ENV_FRAME_HEIGHT,
+ 15 : 15 + self.config.ENV_FRAME_WIDTH,
+ ] = self.rgb_vis # depth_vis or rgb_vis
+ self.vis_image[50:530, 670:1150] = sem_map_vis
+
+ pos = (
+ (start_x * 100.0 / self.config.MAP_RESOLUTION - gy1)
+ * 480
+ / map_pred.shape[0],
+ (map_pred.shape[1] - start_y * 100.0 / self.config.MAP_RESOLUTION + gx1)
+ * 480
+ / map_pred.shape[1],
+ np.deg2rad(-start_o),
+ )
+
+ agent_arrow = vu.get_contour_points(pos, origin=(670, 50))
+ color = (
+ int(color_palette[11] * 255),
+ int(color_palette[10] * 255),
+ int(color_palette[9] * 255),
+ )
+ cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1)
+
+ if self.config.VISUALIZE:
+ fn = "{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png".format(
+ dump_dir,
+ self.rank,
+ self.episode_no,
+ self.rank,
+ self.episode_no,
+ self.timestep,
+ )
+ cv2.imwrite(fn, self.vis_image)
diff --git a/semantic_exploration/agents/utils/detic_semantic_prediction.py b/semantic_exploration/agents/utils/detic_semantic_prediction.py
new file mode 100644
index 0000000..e14d044
--- /dev/null
+++ b/semantic_exploration/agents/utils/detic_semantic_prediction.py
@@ -0,0 +1,338 @@
+# The following code is largely borrowed from
+# https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and
+# https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
+
+import argparse
+import pathlib
+import sys
+import time
+from pathlib import Path
+
+import detectron2.data.transforms as T
+import numpy as np
+import torch
+
+ROOT_DETIC = str(Path(__file__).resolve().parent).split("third_party")[0] + "third_party/"
+sys.path.insert(0, ROOT_DETIC + "Detic/third_party/CenterNet2")
+sys.path.insert(0, ROOT_DETIC + "Detic")
+from centernet.config import add_centernet_config # noqa: E402
+from third_party.semantic_exploration.constants import coco_categories_mapping # noqa: E402
+from detectron2.checkpoint import DetectionCheckpointer # noqa: E402
+from detectron2.config import get_cfg # noqa: E402
+from detectron2.data.catalog import MetadataCatalog # noqa: E402
+from detectron2.engine.defaults import DefaultPredictor # noqa: E402
+from detectron2.modeling import build_model # noqa: E402
+from detectron2.utils.logger import setup_logger # noqa: E402
+from detectron2.utils.visualizer import ColorMode, Visualizer # noqa: E402
+from detic.config import add_detic_config # noqa: E402
+from detic.modeling.text.text_encoder import build_text_encoder # noqa: E402
+from detic.modeling.utils import reset_cls_test # noqa: E402
+
+BUILDIN_CLASSIFIER = {
+ "lvis": ROOT_DETIC + "Detic/datasets/metadata/lvis_v1_clip_a+cname.npy",
+ "objects365": ROOT_DETIC + "Detic/datasets/metadata/o365_clip_a+cnamefix.npy",
+ "openimages": ROOT_DETIC + "Detic/datasets/metadata/oid_clip_a+cname.npy",
+ "coco": ROOT_DETIC + "Detic/datasets/metadata/coco_clip_a+cname.npy",
+}
+
+BUILDIN_METADATA_PATH = {
+ "lvis": "lvis_v1_val",
+ "objects365": "objects365_v2_val",
+ "openimages": "oid_val_expanded",
+ "coco": "coco_2017_val",
+}
+
+
+class SemanticPredDetic:
+ def __init__(self, args):
+ self.segmentation_model = ImageSegmentation(args)
+ self.args = args
+
+ def get_prediction(self, img):
+ args = self.args
+ image_list = []
+ img = img[:, :, ::-1]
+ image_list.append(img)
+ seg_predictions, vis_output = self.segmentation_model.get_predictions(
+ image_list, visualize=args.visualize == 2
+ )
+
+ if args.visualize == 2:
+ img = vis_output.get_image()
+
+ semantic_input = np.zeros(
+ (img.shape[0], img.shape[1], 16 + 1)
+        )  # (num_sem_categories + 1) channels
+
+ for j, class_idx in enumerate(
+ seg_predictions[0]["instances"].pred_classes.cpu().numpy()
+ ):
+ if class_idx in list(coco_categories_mapping.keys()):
+ idx = coco_categories_mapping[class_idx]
+ obj_mask = seg_predictions[0]["instances"].pred_masks[j] * 1.0
+ semantic_input[:, :, idx] += obj_mask.cpu().numpy()
+ # The shape of the semantic input is (480, 640, 17)
+ return semantic_input, img
+
+
+def compress_sem_map(sem_map):
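+    # Collapse a (C, H, W) stack of per-category masks into a single (H, W) label
+    # map; later channels overwrite earlier ones and 0 means "no category".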
+ c_map = np.zeros((sem_map.shape[1], sem_map.shape[2]))
+ for i in range(sem_map.shape[0]):
+ c_map[sem_map[i] > 0.0] = i + 1
+ return c_map
+
+
+class ImageSegmentation:
+ def __init__(self, args):
+ string_args = """
+ --config-file {}
+ --input input1.jpeg
+ --vocabulary coco
+ --confidence-threshold {}
+ --opts MODEL.WEIGHTS {}
+ """.format(
+ ROOT_DETIC + "/Detic/configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml",
+ args.sem_pred_prob_thr,
+ ROOT_DETIC + "/Detic/configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth"
+ )
+
+ if args.sem_gpu_id == -2:
+ string_args += """ MODEL.DEVICE cpu"""
+ else:
+ string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id)
+
+ string_args = string_args.split()
+
+ args = get_seg_parser().parse_args(string_args)
+ logger = setup_logger()
+ logger.info("Arguments: " + str(args))
+
+ cfg = setup_cfg(args)
+
+ assert args.vocabulary in ["coco", "custom"]
+ if args.vocabulary == "custom":
+ raise NotImplementedError
+ elif args.vocabulary == "coco":
+ self.metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[args.vocabulary])
+ classifier = BUILDIN_CLASSIFIER[args.vocabulary]
+ self.categories_mapping = {
+ 56: 0, # chair
+ 57: 1, # couch
+ 58: 2, # plant
+ 59: 3, # bed
+ 61: 4, # toilet
+ 62: 5, # tv
+ 60: 6, # table
+ 69: 7, # oven
+ 71: 8, # sink
+ 72: 9, # refrigerator
+ 73: 10, # book
+ 74: 11, # clock
+ 75: 12, # vase
+ 41: 13, # cup
+ 39: 14, # bottle
+ }
+
+ self.num_sem_categories = len(self.categories_mapping)
+ num_classes = len(self.metadata.thing_classes)
+ self.instance_mode = ColorMode.IMAGE
+ self.demo = VisualizationDemo(cfg, classifier, num_classes)
+
+ def get_predictions(self, img, visualize=0):
+ return self.demo.run_on_image(img, visualize=visualize)
+
+
+def setup_cfg(args):
+ cfg = get_cfg()
+    # Default to CPU; a MODEL.DEVICE entry in args.opts (merged below) overrides this
+ cfg.MODEL.DEVICE = "cpu"
+ add_centernet_config(cfg)
+ add_detic_config(cfg)
+ cfg.merge_from_file(args.config_file)
+ cfg.merge_from_list(args.opts)
+ # Set score_threshold for builtin models
+ cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
+ cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
+ cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = (
+ args.confidence_threshold
+ )
+ cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "rand" # load later
+ cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True
+ cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = (
+ ROOT_DETIC + "Detic/" + cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH
+ )
+ # Fix cfg paths given we're not running from the Detic folder
+ cfg.MODEL.TEST_CLASSIFIERS[0] = (
+ ROOT_DETIC + "Detic/" + cfg.MODEL.TEST_CLASSIFIERS[0]
+ )
+ cfg.MODEL.TEST_CLASSIFIERS[1] = (
+ ROOT_DETIC + "Detic/" + cfg.MODEL.TEST_CLASSIFIERS[1]
+ )
+ cfg.freeze()
+ return cfg
+
+
+class VisualizationDemo(object):
+ def __init__(self, cfg, classifier, num_classes, instance_mode=ColorMode.IMAGE):
+ """
+ Args:
+ cfg (CfgNode):
+ instance_mode (ColorMode):
+ """
+ self.metadata = MetadataCatalog.get(
+ cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
+ )
+ self.cpu_device = torch.device("cpu")
+ self.instance_mode = instance_mode
+ self.predictor = BatchPredictor(cfg)
+
+ if type(classifier) == pathlib.PosixPath:
+ classifier = str(classifier)
+ reset_cls_test(self.predictor.model, classifier, num_classes)
+
+ def run_on_image(self, image_list, visualize=0):
+ """
+ Args:
+            image_list (list of np.ndarray): images of shape (H, W, C)
+                (in BGR order). This is the format used by OpenCV.
+
+ Returns:
+ predictions (dict): the output of the model.
+ vis_output (VisImage): the visualized image output.
+ """
+ vis_output = None
+ all_predictions = self.predictor(image_list)
+
+ # Convert image from OpenCV BGR format to Matplotlib RGB format.
+ if visualize:
+ predictions = all_predictions[0]
+ image = image_list[0]
+ visualizer = Visualizer(
+ image, self.metadata, instance_mode=self.instance_mode
+ )
+ if "panoptic_seg" in predictions:
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
+ vis_output = visualizer.draw_panoptic_seg_predictions(
+ panoptic_seg.to(self.cpu_device), segments_info
+ )
+ else:
+ if "sem_seg" in predictions:
+ vis_output = visualizer.draw_sem_seg(
+ predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+ )
+ if "instances" in predictions:
+ instances = predictions["instances"].to(self.cpu_device)
+ vis_output = visualizer.draw_instance_predictions(
+ predictions=instances
+ )
+
+ return all_predictions, vis_output
+
+
+def get_seg_parser():
+ parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
+ parser.add_argument(
+ "--config-file",
+ default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+ metavar="FILE",
+ help="path to config file",
+ )
+ parser.add_argument(
+ "--webcam", action="store_true", help="Take inputs from webcam."
+ )
+ parser.add_argument("--video-input", help="Path to video file.")
+ parser.add_argument(
+ "--input", nargs="+", help="A list of space separated input images"
+ )
+ parser.add_argument(
+ "--output",
+ help="A file or directory to save output visualizations. "
+ "If not given, will show output in an OpenCV window.",
+ )
+ parser.add_argument(
+ "--vocabulary",
+ default="lvis",
+ choices=["lvis", "openimages", "objects365", "coco", "custom"],
+ help="",
+ )
+ parser.add_argument(
+ "--custom_vocabulary",
+ default="",
+ help="",
+ )
+ parser.add_argument(
+ "--confidence-threshold",
+ type=float,
+ default=0.1,
+ help="Minimum score for instance predictions to be shown",
+ )
+ parser.add_argument(
+ "--opts",
+ help="Modify config options using the command-line 'KEY VALUE' pairs",
+ default=[],
+ nargs=argparse.REMAINDER,
+ )
+ return parser
+
+
+class BatchPredictor:
+ """
+    Create a simple end-to-end predictor with the given config that runs on a
+    single device for a list of input images.
+
+    Compared to using the model directly, this class adds the following:
+
+ 1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+ 2. Always take BGR image as the input and apply conversion defined by
+ `cfg.INPUT.FORMAT`.
+ 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+ 4. Take a list of input images
+
+ Attributes:
+ metadata (Metadata): the metadata of the underlying dataset, obtained
+ from cfg.DATASETS.TEST.
+
+ """
+
+ def __init__(self, cfg):
+ self.cfg = cfg.clone() # cfg can be modified by model
+ self.model = build_model(self.cfg)
+ self.model.eval()
+ self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+ checkpointer = DetectionCheckpointer(self.model)
+ checkpointer.load(cfg.MODEL.WEIGHTS)
+
+ self.input_format = cfg.INPUT.FORMAT
+ assert self.input_format in ["RGB", "BGR"], self.input_format
+
+ def __call__(self, image_list):
+ """
+ Args:
+ image_list (list of np.ndarray): a list of images of
+ shape (H, W, C) (in BGR order).
+
+ Returns:
+            predictions (list[dict]):
+                the outputs of the model, one per input image.
+ See :doc:`/tutorials/models` for details about the format.
+ """
+ inputs = []
+ for original_image in image_list:
+ # https://github.com/sphinx-doc/sphinx/issues/4258
+ # Apply pre-processing to image.
+ if self.input_format == "RGB":
+ # whether the model expects BGR inputs or RGB
+ original_image = original_image[:, :, ::-1]
+ height, width = original_image.shape[:2]
+ image = original_image
+ image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+
+ instance = {"image": image, "height": height, "width": width}
+
+ inputs.append(instance)
+
+ with torch.no_grad():
+ predictions = self.model(inputs)
+ return predictions
diff --git a/semantic_exploration/agents/utils/fmm_planner.py b/semantic_exploration/agents/utils/fmm_planner.py
new file mode 100644
index 0000000..5256498
--- /dev/null
+++ b/semantic_exploration/agents/utils/fmm_planner.py
@@ -0,0 +1,153 @@
+import cv2
+import numpy as np
+import skfmm
+import skimage
+from numpy import ma
+
+
+def get_mask(sx, sy, scale, step_size):
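+    # Binary ring mask: cells whose distance from the sub-cell agent position
+    # (sx, sy) lies in (step_size - 1, step_size], plus the center cell, so that
+    # short-term goal candidates sit roughly one step away.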
+ size = int(step_size // scale) * 2 + 1
+ mask = np.zeros((size, size))
+ for i in range(size):
+ for j in range(size):
+ if ((i + 0.5) - (size // 2 + sx)) ** 2 + (
+ (j + 0.5) - (size // 2 + sy)
+ ) ** 2 <= step_size**2 and ((i + 0.5) - (size // 2 + sx)) ** 2 + (
+ (j + 0.5) - (size // 2 + sy)
+ ) ** 2 > (
+ step_size - 1
+ ) ** 2:
+ mask[i, j] = 1
+
+ mask[size // 2, size // 2] = 1
+ return mask
+
+
+def get_dist(sx, sy, scale, step_size):
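+    # Per-cell Euclidean distance from the agent within the step_size disk,
+    # clamped below at 5; used in get_short_term_goal to normalize the change in
+    # FMM distance by the physical distance travelled.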
+ size = int(step_size // scale) * 2 + 1
+ mask = np.zeros((size, size)) + 1e-10
+ for i in range(size):
+ for j in range(size):
+ if ((i + 0.5) - (size // 2 + sx)) ** 2 + (
+ (j + 0.5) - (size // 2 + sy)
+ ) ** 2 <= step_size**2:
+ mask[i, j] = max(
+ 5,
+ (
+ ((i + 0.5) - (size // 2 + sx)) ** 2
+ + ((j + 0.5) - (size // 2 + sy)) ** 2
+ )
+ ** 0.5,
+ )
+ return mask
+
+
+class FMMPlanner:
+ def __init__(self, traversible, scale=1, step_size=5):
+ self.scale = scale
+ self.step_size = step_size
+ if scale != 1.0:
+ self.traversible = cv2.resize(
+ traversible,
+ (traversible.shape[1] // scale, traversible.shape[0] // scale),
+ interpolation=cv2.INTER_NEAREST,
+ )
+ self.traversible = np.rint(self.traversible)
+ else:
+ self.traversible = traversible
+
+ self.du = int(self.step_size / (self.scale * 1.0))
+ self.fmm_dist = None
+
+ def set_goal(self, goal, auto_improve=False):
+ traversible_ma = ma.masked_values(self.traversible * 1, 0)
+ goal_x, goal_y = int(goal[0] / (self.scale * 1.0)), int(
+ goal[1] / (self.scale * 1.0)
+ )
+
+ if self.traversible[goal_x, goal_y] == 0.0 and auto_improve:
+ goal_x, goal_y = self._find_nearest_goal([goal_x, goal_y])
+
+ traversible_ma[goal_x, goal_y] = 0
+ dd = skfmm.distance(traversible_ma, dx=1)
+ dd = ma.filled(dd, np.max(dd) + 1)
+ self.fmm_dist = dd
+ return
+
+ def set_multi_goal(self, goal_map):
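+        # Compute the fast-marching (geodesic) distance from every cell marked 1
+        # in goal_map, treating non-traversible cells as masked obstacles.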
+ traversible_ma = ma.masked_values(self.traversible * 1, 0)
+ traversible_ma[goal_map == 1] = 0
+ dd = skfmm.distance(traversible_ma, dx=1)
+ dd = ma.filled(dd, np.max(dd) + 1)
+ self.fmm_dist = dd
+ return
+
+ def get_short_term_goal(self, state):
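+        # Examine a (2 * du + 1)^2 window of the FMM distance field around the
+        # agent, keep only cells on the step_size ring, and pick the cell with the
+        # largest decrease in distance-to-goal; also report whether to stop (goal
+        # within step_size) or replan (no cell improves the distance).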
+ scale = self.scale * 1.0
+ state = [x / scale for x in state]
+ dx, dy = state[0] - int(state[0]), state[1] - int(state[1])
+ mask = get_mask(dx, dy, scale, self.step_size)
+ dist_mask = get_dist(dx, dy, scale, self.step_size)
+
+ state = [int(x) for x in state]
+
+ dist = np.pad(
+ self.fmm_dist,
+ self.du,
+ "constant",
+ constant_values=self.fmm_dist.shape[0] ** 2,
+ )
+ subset = dist[
+ state[0] : state[0] + 2 * self.du + 1, state[1] : state[1] + 2 * self.du + 1
+ ]
+
+ assert (
+ subset.shape[0] == 2 * self.du + 1 and subset.shape[1] == 2 * self.du + 1
+ ), "Planning error: unexpected subset shape {}".format(subset.shape)
+
+ subset *= mask
+ subset += (1 - mask) * self.fmm_dist.shape[0] ** 2
+
+ if subset[self.du, self.du] < self.step_size: # < 0.25 * 100 / 5.: # 25cm
+ stop = True
+ else:
+ stop = False
+
+ subset -= subset[self.du, self.du]
+ ratio1 = subset / dist_mask
+ subset[ratio1 < -1.5] = 1
+
+ # Find the smallest number index
+ (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape)
+
+ if subset[stg_x, stg_y] > -0.0001:
+ replan = True
+ else:
+ replan = False
+
+ return (
+ (stg_x + state[0] - self.du) * scale,
+ (stg_y + state[1] - self.du) * scale,
+ replan,
+ stop,
+ )
+
+ def _find_nearest_goal(self, goal):
+ traversible = (
+ skimage.morphology.binary_dilation(
+ np.zeros(self.traversible.shape), skimage.morphology.disk(2)
+ )
+ != True # noqa
+ )
+ traversible = traversible * 1.0
+ planner = FMMPlanner(traversible)
+ planner.set_goal(goal)
+
+ mask = self.traversible
+
+ dist_map = planner.fmm_dist * mask
+ dist_map[dist_map == 0] = dist_map.max()
+
+ goal = np.unravel_index(dist_map.argmin(), dist_map.shape)
+
+ return goal
diff --git a/semantic_exploration/agents/utils/owlvit_semantic_prediction.py b/semantic_exploration/agents/utils/owlvit_semantic_prediction.py
new file mode 100644
index 0000000..f40614b
--- /dev/null
+++ b/semantic_exploration/agents/utils/owlvit_semantic_prediction.py
@@ -0,0 +1,107 @@
+# The following code is largely borrowed from
+# https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and
+# https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
+
+import sys
+from pathlib import Path
+ROOT_DETIC = str(Path(__file__).resolve().parent).split("third_party")[0] + "third_party/"
+sys.path.insert(0, ROOT_DETIC + "Detic/third_party/CenterNet2")
+sys.path.insert(0, ROOT_DETIC + "Detic")
+
+import argparse # noqa: E402
+import pathlib # noqa: E402
+import time # noqa: E402
+from pathlib import Path # noqa: E402
+
+import cv2 # noqa: E402
+import numpy as np # noqa: E402
+import torch # noqa: E402
+from third_party.semantic_exploration.constants import coco_categories, coco_categories_mapping # noqa: E402
+from PIL import Image # noqa: E402
+from transformers import OwlViTForObjectDetection, OwlViTProcessor # noqa: E402
+
+
+class SemanticPredOwlvit:
+ def __init__(self, config):
+ self.config = config
+ # Get the device
+ self.device = (
+ torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ )
+ # Get the owlvit model
+ self.model = OwlViTForObjectDetection.from_pretrained(
+ "google/owlvit-base-patch32"
+ )
+ self.model.eval()
+ self.model.to(self.device)
+ # Define the prefix
+        self.prefix = "an image of "
+ # Get the pretrained model
+ self.processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+ # Get the meta info
+ labels = []
+ for _key in coco_categories:
+ labels.append(self.prefix+_key)
+ self.labels = [labels]
+ self.score_threshold = 0.15
+
+ def get_prediction(self, img):
+ img = img[:, :, ::-1]
+ # Process inputs
+ inputs = self.processor(text=self.labels, images=img, return_tensors="pt")
+ target_sizes = torch.Tensor([img.shape[:2]])
+
+ # Inference
+ with torch.no_grad():
+ outputs = self.model(**inputs)
+
+ # Convert outputs (bounding boxes and class logits) to COCO API
+ results = self.processor.post_process(
+ outputs=outputs, target_sizes=target_sizes
+ )
+
+ # Process the image
+ img_i = 0
+ boxes, scores, labels = (
+ results[img_i]["boxes"],
+ results[img_i]["scores"],
+ results[img_i]["labels"],
+ )
+ semantic_input = np.zeros((img.shape[0], img.shape[1], 16 + 1))
+ for box, score, label in zip(boxes, scores, labels):
+ # Get the location of the bounding box
+ if score >= self.score_threshold:
+ top_left_x, top_left_y, bottom_right_x, bottom_right_y = [
+ int(round(i, 0)) for i in box.tolist()
+ ]
+ semantic_input[
+ top_left_x:bottom_right_x, top_left_y:bottom_right_y, int(label)
+ ] = 1
+ if self.config.VISUALIZE is True and score >= self.score_threshold:
+ # Use this line code to add bounding box to the image
+ img = np.ascontiguousarray(img, dtype=np.uint8)
+ cv2.rectangle(
+ img,
+ (top_left_x, top_left_y),
+ (bottom_right_x, bottom_right_y),
+ (0, 0, 255),
+ 2,
+ )
+ cv2.putText(
+ img,
+ self.labels[0][int(label)],
+ (top_left_x, top_left_y - 10),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.9,
+ (0, 0, 255),
+ 2,
+ )
+
+ return semantic_input, img
+
+
+def compress_sem_map(sem_map):
+ c_map = np.zeros((sem_map.shape[1], sem_map.shape[2]))
+ for i in range(sem_map.shape[0]):
+ c_map[sem_map[i] > 0.0] = i + 1
+ return c_map
diff --git a/agents/utils/semantic_prediction.py b/semantic_exploration/agents/utils/semantic_prediction.py
similarity index 84%
rename from agents/utils/semantic_prediction.py
rename to semantic_exploration/agents/utils/semantic_prediction.py
index 3ce9675..3b70ee0 100644
--- a/agents/utils/semantic_prediction.py
+++ b/semantic_exploration/agents/utils/semantic_prediction.py
@@ -4,23 +4,20 @@
import argparse
import time
-
-import torch
+from pathlib import Path
+import detectron2.data.transforms as T
import numpy as np
-
+import torch
+from third_party.semantic_exploration.constants import coco_categories_mapping
+from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
-from detectron2.utils.logger import setup_logger
from detectron2.data.catalog import MetadataCatalog
from detectron2.modeling import build_model
-from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import ColorMode, Visualizer
-import detectron2.data.transforms as T
-from constants import coco_categories_mapping
-
-
-class SemanticPredMaskRCNN():
+class SemanticPredMaskRCNN:
def __init__(self, args):
self.segmentation_model = ImageSegmentation(args)
self.args = args
@@ -31,39 +28,48 @@ def get_prediction(self, img):
img = img[:, :, ::-1]
image_list.append(img)
seg_predictions, vis_output = self.segmentation_model.get_predictions(
- image_list, visualize=args.visualize == 2)
+ image_list, visualize=args.visualize == 2
+ )
if args.visualize == 2:
img = vis_output.get_image()
- semantic_input = np.zeros((img.shape[0], img.shape[1], 15 + 1))
+ semantic_input = np.zeros(
+ (img.shape[0], img.shape[1], 16 + 1)
+        )  # (num_sem_categories + 1) channels
for j, class_idx in enumerate(
- seg_predictions[0]['instances'].pred_classes.cpu().numpy()):
+ seg_predictions[0]["instances"].pred_classes.cpu().numpy()
+ ):
if class_idx in list(coco_categories_mapping.keys()):
idx = coco_categories_mapping[class_idx]
- obj_mask = seg_predictions[0]['instances'].pred_masks[j] * 1.
+ obj_mask = seg_predictions[0]["instances"].pred_masks[j] * 1.0
semantic_input[:, :, idx] += obj_mask.cpu().numpy()
-
+ # The shape of the semantic input is (480, 640, 17)
return semantic_input, img
def compress_sem_map(sem_map):
c_map = np.zeros((sem_map.shape[1], sem_map.shape[2]))
for i in range(sem_map.shape[0]):
- c_map[sem_map[i] > 0.] = i + 1
+ c_map[sem_map[i] > 0.0] = i + 1
return c_map
-class ImageSegmentation():
+class ImageSegmentation:
def __init__(self, args):
+        ROOT = str(Path(__file__).resolve().parent).split("third_party")[0] + "third_party/"
+ model_path = ROOT + "detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
string_args = """
- --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
+ --config-file {}
--input input1.jpeg
--confidence-threshold {}
--opts MODEL.WEIGHTS
detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
- """.format(args.sem_pred_prob_thr)
+ """.format(
+ model_path,
+ args.sem_pred_prob_thr
+ )
if args.sem_gpu_id == -2:
string_args += """ MODEL.DEVICE cpu"""
@@ -71,7 +77,6 @@ def __init__(self, args):
string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id)
string_args = string_args.split()
-
args = get_seg_parser().parse_args(string_args)
logger = setup_logger()
logger.info("Arguments: " + str(args))
@@ -91,15 +96,15 @@ def setup_cfg(args):
# Set score_threshold for builtin models
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
- cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = \
+ cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = (
args.confidence_threshold
+ )
cfg.freeze()
return cfg
def get_seg_parser():
- parser = argparse.ArgumentParser(
- description="Detectron2 demo for builtin models")
+ parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
parser.add_argument(
"--config-file",
default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
@@ -107,14 +112,12 @@ def get_seg_parser():
help="path to config file",
)
parser.add_argument(
- "--webcam",
- action="store_true",
- help="Take inputs from webcam.")
+ "--webcam", action="store_true", help="Take inputs from webcam."
+ )
parser.add_argument("--video-input", help="Path to video file.")
parser.add_argument(
- "--input",
- nargs="+",
- help="A list of space separated input images")
+ "--input", nargs="+", help="A list of space separated input images"
+ )
parser.add_argument(
"--output",
help="A file or directory to save output visualizations. "
@@ -124,7 +127,7 @@ def get_seg_parser():
parser.add_argument(
"--confidence-threshold",
type=float,
- default=0.5,
+ default=0.1,
help="Minimum score for instance predictions to be shown",
)
parser.add_argument(
@@ -169,7 +172,8 @@ def run_on_image(self, image_list, visualize=0):
predictions = all_predictions[0]
image = image_list[0]
visualizer = Visualizer(
- image, self.metadata, instance_mode=self.instance_mode)
+ image, self.metadata, instance_mode=self.instance_mode
+ )
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
@@ -178,13 +182,13 @@ def run_on_image(self, image_list, visualize=0):
else:
if "sem_seg" in predictions:
vis_output = visualizer.draw_sem_seg(
- predictions["sem_seg"].argmax(
- dim=0).to(self.cpu_device)
+ predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
)
if "instances" in predictions:
instances = predictions["instances"].to(self.cpu_device)
vis_output = visualizer.draw_instance_predictions(
- predictions=instances)
+ predictions=instances
+ )
return all_predictions, vis_output
diff --git a/agents/utils/visualization.py b/semantic_exploration/agents/utils/visualization.py
similarity index 61%
rename from agents/utils/visualization.py
rename to semantic_exploration/agents/utils/visualization.py
index 16b3d40..a269bc5 100644
--- a/agents/utils/visualization.py
+++ b/semantic_exploration/agents/utils/visualization.py
@@ -4,14 +4,16 @@
def get_contour_points(pos, origin, size=20):
x, y, o = pos
- pt1 = (int(x) + origin[0],
- int(y) + origin[1])
- pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0],
- int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1])
- pt3 = (int(x + size * np.cos(o)) + origin[0],
- int(y + size * np.sin(o)) + origin[1])
- pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0],
- int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1])
+ pt1 = (int(x) + origin[0], int(y) + origin[1])
+ pt2 = (
+ int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0],
+ int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1],
+ )
+ pt3 = (int(x + size * np.cos(o)) + origin[0], int(y + size * np.sin(o)) + origin[1])
+ pt4 = (
+ int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0],
+ int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1],
+ )
return np.array([pt1, pt2, pt3, pt4])
@@ -20,7 +22,7 @@ def draw_line(start, end, mat, steps=25, w=1):
for i in range(steps + 1):
x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps))
y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps))
- mat[x - w:x + w, y - w:y + w] = 1
+ mat[x - w : x + w, y - w : y + w] = 1
return mat
@@ -35,17 +37,17 @@ def init_vis_image(goal_name, legend):
textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
textX = (640 - textsize[0]) // 2 + 15
textY = (50 + textsize[1]) // 2
- vis_image = cv2.putText(vis_image, text, (textX, textY),
- font, fontScale, color, thickness,
- cv2.LINE_AA)
+ vis_image = cv2.putText(
+ vis_image, text, (textX, textY), font, fontScale, color, thickness, cv2.LINE_AA
+ )
text = "Predicted Semantic Map"
textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
textX = 640 + (480 - textsize[0]) // 2 + 30
textY = (50 + textsize[1]) // 2
- vis_image = cv2.putText(vis_image, text, (textX, textY),
- font, fontScale, color, thickness,
- cv2.LINE_AA)
+ vis_image = cv2.putText(
+ vis_image, text, (textX, textY), font, fontScale, color, thickness, cv2.LINE_AA
+ )
# draw outlines
color = [100, 100, 100]
@@ -60,6 +62,6 @@ def init_vis_image(goal_name, legend):
# draw legend
lx, ly, _ = legend.shape
- vis_image[537:537 + lx, 155:155 + ly, :] = legend
+ vis_image[537 : 537 + lx, 155 : 155 + ly, :] = legend
return vis_image
diff --git a/semantic_exploration/constants.py b/semantic_exploration/constants.py
new file mode 100644
index 0000000..ac92b95
--- /dev/null
+++ b/semantic_exploration/constants.py
@@ -0,0 +1,155 @@
+scenes = {}
+scenes["train"] = [
+ "Allensville",
+ "Beechwood",
+ "Benevolence",
+ "Coffeen",
+ "Cosmos",
+ "Forkland",
+ "Hanson",
+ "Hiteman",
+ "Klickitat",
+ "Lakeville",
+ "Leonardo",
+ "Lindenwood",
+ "Marstons",
+ "Merom",
+ "Mifflinburg",
+ "Newfields",
+ "Onaga",
+ "Pinesdale",
+ "Pomaria",
+ "Ranchester",
+ "Shelbyville",
+ "Stockman",
+ "Tolstoy",
+ "Wainscott",
+ "Woodbine",
+]
+
+scenes["val"] = [
+ "Collierville",
+ "Corozal",
+ "Darden",
+ "Markleeville",
+ "Wiconisco",
+]
+
+coco_categories = {
+ "chair": 0,
+ "couch": 1,
+ "potted plant": 2,
+ "bed": 3,
+ "toilet": 4,
+ "tv": 5,
+ "dining-table": 6,
+ "oven": 7,
+ "sink": 8,
+ "refrigerator": 9,
+ "book": 10,
+ "clock": 11,
+ "vase": 12,
+ "cup": 13,
+ "bottle": 14,
+}
+
+coco_categories_replica = {
+ "chair": 0,
+ "sofa": 1,
+ "plant": 2,
+ "bed": 3,
+ "toilet": 4,
+ "tv": 5,
+ "table": 6,
+ "oven": 7,
+ "sink": 8,
+ "fridge": 9,
+ "book": 10,
+ "clock": 11,
+ "vase": 12,
+ "cup": 13,
+ "bottle": 14,
+ "person": 15,
+}
+
+coco_categories_mapping = {
+ 56: 0, # chair
+ 57: 1, # couch
+ 58: 2, # potted plant
+ 59: 3, # bed
+ 61: 4, # toilet
+ 62: 5, # tv
+ 60: 6, # dining-table
+ 69: 7, # oven
+ 71: 8, # sink
+ 72: 9, # refrigerator
+ 73: 10, # book
+ 74: 11, # clock
+ 75: 12, # vase
+ 41: 13, # cup
+ 39: 14, # bottle
+ 0: 15, # person
+}
+
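+# Flat list of RGB components in [0, 1], three consecutive values per palette
+# entry; it is scaled to 0-255 and consumed via PIL's Image.putpalette when the
+# semantic map is visualized.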
+color_palette = [
+ 1.0,
+ 1.0,
+ 1.0,
+ 0.6,
+ 0.6,
+ 0.6,
+ 0.95,
+ 0.95,
+ 0.95,
+ 0.96,
+ 0.36,
+ 0.26,
+ 0.12156862745098039,
+ 0.47058823529411764,
+ 0.7058823529411765,
+ 0.9400000000000001,
+ 0.7818,
+ 0.66,
+ 0.9400000000000001,
+ 0.8868,
+ 0.66,
+ 0.8882000000000001,
+ 0.9400000000000001,
+ 0.66,
+ 0.7832000000000001,
+ 0.9400000000000001,
+ 0.66,
+ 0.6782000000000001,
+ 0.9400000000000001,
+ 0.66,
+ 0.66,
+ 0.9400000000000001,
+ 0.7468000000000001,
+ 0.66,
+ 0.9400000000000001,
+ 0.8518000000000001,
+ 0.66,
+ 0.9232,
+ 0.9400000000000001,
+ 0.66,
+ 0.8182,
+ 0.9400000000000001,
+ 0.66,
+ 0.7132,
+ 0.9400000000000001,
+ 0.7117999999999999,
+ 0.66,
+ 0.9400000000000001,
+ 0.8168,
+ 0.66,
+ 0.9400000000000001,
+ 0.9218,
+ 0.66,
+ 0.9400000000000001,
+ 0.9400000000000001,
+ 0.66,
+ 0.8531999999999998,
+ 0.9400000000000001,
+ 0.66,
+ 0.748199999999999,
+]
diff --git a/docs/DOCKER_INSTRUCTIONS.md b/semantic_exploration/docs/DOCKER_INSTRUCTIONS.md
similarity index 100%
rename from docs/DOCKER_INSTRUCTIONS.md
rename to semantic_exploration/docs/DOCKER_INSTRUCTIONS.md
diff --git a/docs/INSTRUCTIONS.md b/semantic_exploration/docs/INSTRUCTIONS.md
similarity index 100%
rename from docs/INSTRUCTIONS.md
rename to semantic_exploration/docs/INSTRUCTIONS.md
diff --git a/docs/example.gif b/semantic_exploration/docs/example.gif
similarity index 100%
rename from docs/example.gif
rename to semantic_exploration/docs/example.gif
diff --git a/docs/legend.png b/semantic_exploration/docs/legend.png
similarity index 100%
rename from docs/legend.png
rename to semantic_exploration/docs/legend.png
diff --git a/docs/overview.jpg b/semantic_exploration/docs/overview.jpg
similarity index 100%
rename from docs/overview.jpg
rename to semantic_exploration/docs/overview.jpg
diff --git a/envs/__init__.py b/semantic_exploration/envs/__init__.py
similarity index 80%
rename from envs/__init__.py
rename to semantic_exploration/envs/__init__.py
index 2098b62..9ea7d02 100755
--- a/envs/__init__.py
+++ b/semantic_exploration/envs/__init__.py
@@ -1,11 +1,9 @@
import torch
-from .habitat import construct_envs
-
-def make_vec_envs(args):
- envs = construct_envs(args)
- envs = VecPyTorch(envs, args.device)
+def make_vec_envs(args, is_slurm=False, is_eval=False):
+ envs, num_envs = construct_envs(args, is_slurm, is_eval)
+ envs = VecPyTorch(envs, num_envs, args.device)
return envs
@@ -13,11 +11,11 @@ def make_vec_envs(args):
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/envs.py#L159
class VecPyTorch():
- def __init__(self, venv, device):
+ def __init__(self, venv, num_envs, device):
self.venv = venv
- self.num_envs = venv.num_envs
- self.observation_space = venv.observation_space
- self.action_space = venv.action_space
+ self.num_envs = num_envs
+ # self.observation_space = venv.observation_space
+ # self.action_space = venv.action_space
self.device = device
def reset(self):
diff --git a/envs/utils/depth_utils.py b/semantic_exploration/envs/utils/depth_utils.py
similarity index 99%
rename from envs/utils/depth_utils.py
rename to semantic_exploration/envs/utils/depth_utils.py
index afe98e2..a6c430a 100644
--- a/envs/utils/depth_utils.py
+++ b/semantic_exploration/envs/utils/depth_utils.py
@@ -21,7 +21,7 @@
import numpy as np
import torch
-import envs.utils.rotation_utils as ru
+import third_party.semantic_exploration.envs.utils.rotation_utils as ru
def get_camera_matrix(width, height, fov):
diff --git a/envs/utils/fmm_planner.py b/semantic_exploration/envs/utils/fmm_planner.py
similarity index 90%
rename from envs/utils/fmm_planner.py
rename to semantic_exploration/envs/utils/fmm_planner.py
index c2fd0bd..82bce12 100644
--- a/envs/utils/fmm_planner.py
+++ b/semantic_exploration/envs/utils/fmm_planner.py
@@ -37,7 +37,15 @@ def get_dist(sx, sy, scale, step_size):
class FMMPlanner():
- def __init__(self, traversible, scale=1, step_size=5):
+ def __init__(self, traversible, scale=1, step_size=25):
+ """
+ Arguments:
+ traversible: (M + 1, M + 1) binary map encoding traversible regions
+ scale: map scale
+ step_size: maximum distance of the short-term goal selected by the
+ planner
+ """
self.scale = scale
self.step_size = step_size
if scale != 1.:
@@ -95,7 +103,7 @@ def get_short_term_goal(self, state):
subset *= mask
subset += (1 - mask) * self.fmm_dist.shape[0] ** 2
- if subset[self.du, self.du] < 0.25 * 100 / 5.: # 25cm
+ if subset[self.du, self.du] < self.step_size: #< 0.25 * 100 / 5.: # 25cm
stop = True
else:
stop = False
@@ -104,6 +112,7 @@ def get_short_term_goal(self, state):
ratio1 = subset / dist_mask
subset[ratio1 < -1.5] = 1
+ # Find the smallest number index
(stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape)
if subset[stg_x, stg_y] > -0.0001:
diff --git a/envs/utils/map_builder.py b/semantic_exploration/envs/utils/map_builder.py
similarity index 100%
rename from envs/utils/map_builder.py
rename to semantic_exploration/envs/utils/map_builder.py
diff --git a/envs/utils/pose.py b/semantic_exploration/envs/utils/pose.py
similarity index 100%
rename from envs/utils/pose.py
rename to semantic_exploration/envs/utils/pose.py
diff --git a/envs/utils/rotation_utils.py b/semantic_exploration/envs/utils/rotation_utils.py
similarity index 100%
rename from envs/utils/rotation_utils.py
rename to semantic_exploration/envs/utils/rotation_utils.py
diff --git a/semantic_exploration/models/__init__.py b/semantic_exploration/models/__init__.py
new file mode 100644
index 0000000..e50ca4e
--- /dev/null
+++ b/semantic_exploration/models/__init__.py
@@ -0,0 +1,3 @@
+from third_party.semantic_exploration.models.owlvit import OwlVit
+from third_party.semantic_exploration.models.semantic_map import Semantic_Mapping
+from third_party.semantic_exploration.models.sentence_similarity import SentenceSimilarity
diff --git a/semantic_exploration/models/owlvit.py b/semantic_exploration/models/owlvit.py
new file mode 100644
index 0000000..7b4e939
--- /dev/null
+++ b/semantic_exploration/models/owlvit.py
@@ -0,0 +1,241 @@
+# mypy: ignore-errors
+import argparse
+import time
+
+import cv2
+import torch
+from PIL import Image
+from transformers import OwlViTForObjectDetection, OwlViTProcessor
+
+
+class OwlVit:
+ def __init__(self, labels, score_threshold, show_img):
+ # self.device = torch.device('cpu')
+ self.device = (
+ torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ )
+
+ self.model = OwlViTForObjectDetection.from_pretrained(
+ "google/owlvit-base-patch32"
+ )
+ self.model.eval()
+ self.model.to(self.device)
+
+ self.processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+
+ self.labels = labels
+ self.score_threshold = score_threshold
+ self.show_img = show_img
+
+ def run_inference(self, img):
+ """
+        img: an OpenCV image in (H, W, C) format
+ """
+ # Process inputs
+ # img = img.to(self.device)
+ inputs = self.processor(text=self.labels, images=img, return_tensors="pt")
+
+ # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+ # target_sizes = torch.Tensor([img.size[::-1]]) this is for PIL images
+ target_sizes = torch.Tensor([img.shape[:2]]).to(self.device)
+ inputs = inputs.to(self.device)
+
+ # Inference
+ with torch.no_grad():
+ outputs = self.model(**inputs)
+
+ # Convert outputs (bounding boxes and class logits) to COCO API
+ results = self.processor.post_process(
+ outputs=outputs, target_sizes=target_sizes
+ )
+ # img = img.to('cpu')
+
+ if self.show_img:
+ self.show_img_with_overlaid_bounding_boxes(img, results)
+
+ return self.get_most_confident_bounding_box_per_label(results)
+
+ def run_inference_and_return_img(self, img):
+ """
+        img: an OpenCV image in (H, W, C) format
+ """
+ # img = img.to(self.device)
+
+ inputs = self.processor(text=self.labels, images=img, return_tensors="pt")
+ target_sizes = torch.Tensor([img.shape[:2]]).to(self.device)
+ inputs = inputs.to(self.device)
+ # Inference
+ with torch.no_grad():
+ outputs = self.model(**inputs)
+
+ # Convert outputs (bounding boxes and class logits) to COCO API
+ results = self.processor.post_process(
+ outputs=outputs, target_sizes=target_sizes
+ )
+ # img = img.to('cpu')
+ # if self.show_img:
+ # self.show_img_with_overlaid_bounding_boxes(img, results)
+
+ return self.get_most_confident_bounding_box_per_label(
+ results
+ ), self.create_img_with_bounding_box(img, results)
+
+ def show_img_with_overlaid_bounding_boxes(self, img, results):
+ img = self.create_img_with_bounding_box(img, results)
+ cv2.imshow("img", img)
+ cv2.waitKey(1)
+
+ def get_bounding_boxes(self, results):
+ """
+ Returns all bounding boxes with a score above the threshold
+ """
+ boxes, scores, labels = (
+ results[0]["boxes"],
+ results[0]["scores"],
+ results[0]["labels"],
+ )
+ boxes = boxes.to("cpu")
+ labels = labels.to("cpu")
+ scores = scores.to("cpu")
+
+ target_boxes = []
+ for box, score, label in zip(boxes, scores, labels):
+ box = [round(i, 2) for i in box.tolist()]
+ if score >= self.score_threshold:
+ target_boxes.append([self.labels[0][label.item()], score.item(), box])
+
+ return target_boxes
+
+ def get_most_confident_bounding_box(self, results):
+ """
+ Returns the most confident bounding box
+ """
+ boxes, scores, labels = (
+ results[0]["boxes"],
+ results[0]["scores"],
+ results[0]["labels"],
+ )
+ boxes = boxes.to("cpu")
+ labels = labels.to("cpu")
+ scores = scores.to("cpu")
+
+ target_box = []
+ target_score = -float("inf")
+
+ for box, score, label in zip(boxes, scores, labels):
+ box = [round(i, 2) for i in box.tolist()]
+ if score >= self.score_threshold:
+ if score > target_score:
+ target_score = score
+ target_box = box
+
+ if target_score == -float("inf"):
+ return None
+ else:
+ x1 = int(target_box[0])
+ y1 = int(target_box[1])
+ x2 = int(target_box[2])
+ y2 = int(target_box[3])
+
+ print("location:", x1, y1, x2, y2)
+ return x1, y1, x2, y2
+
+ def get_most_confident_bounding_box_per_label(self, results):
+ """
+ Returns the most confident bounding box for each label above the threshold
+ """
+ boxes, scores, labels = (
+ results[0]["boxes"],
+ results[0]["scores"],
+ results[0]["labels"],
+ )
+ boxes = boxes.to("cpu")
+ labels = labels.to("cpu")
+ scores = scores.to("cpu")
+
+ # Initialize dictionaries to store most confident bounding boxes and scores per label
+ target_boxes = {}
+ target_scores = {}
+
+ for box, score, label in zip(boxes, scores, labels):
+ box = [round(i, 2) for i in box.tolist()]
+ if score >= self.score_threshold:
+ # If the current score is higher than the stored score for this label, update the target box and score
+ if (
+ label.item() not in target_scores
+ or score > target_scores[label.item()]
+ ):
+ target_scores[label.item()] = score.item()
+ target_boxes[label.item()] = box
+
+ # Format the output
+ result = []
+ for label, box in target_boxes.items():
+ x1 = int(box[0])
+ y1 = int(box[1])
+ x2 = int(box[2])
+ y2 = int(box[3])
+
+ result.append(
+ [self.labels[0][label], target_scores[label], [x1, y1, x2, y2]]
+ )
+
+ return result
+
+ def create_img_with_bounding_box(self, img, results):
+ """
+        Returns an image with all bounding boxes above the threshold overlaid
+ """
+
+ results = self.get_most_confident_bounding_box_per_label(results)
+ font = cv2.FONT_HERSHEY_SIMPLEX
+
+ for label, score, box in results:
+ img = cv2.rectangle(img, box[:2], box[2:], (255, 0, 0), 5)
+ if box[3] + 25 > 768:
+ y = box[3] - 10
+ else:
+ y = box[3] + 25
+ img = cv2.putText(
+ img, label, (box[0], y), font, 1, (255, 0, 0), 2, cv2.LINE_AA
+ )
+
+ return img
+
+ def update_label(self, labels):
+ self.labels = labels
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--file",
+ type=str,
+ default="/home/akshara/spot/spot_rl_experiments/spot_rl/grasp_visualizations/1650841878.2699108.png",
+ )
+ parser.add_argument("--score_threshold", type=float, default=0.1)
+ parser.add_argument("--show_img", type=bool, default=True)
+ parser.add_argument(
+ "--labels",
+ type=list,
+ default=[
+ [
+ "lion plush",
+ "penguin plush",
+ "teddy bear",
+ "bear plush",
+ "caterpilar plush",
+ "ball plush",
+ "rubiks cube",
+ ]
+ ],
+ )
+ args = parser.parse_args()
+
+ file = args.file
+ img = cv2.imread(file)
+
+ V = OwlVit(args.labels, args.score_threshold, args.show_img)
+ results = V.run_inference(img)
+ # Keep the window open for 10 seconds
+ time.sleep(10)
diff --git a/semantic_exploration/models/semantic_map.py b/semantic_exploration/models/semantic_map.py
new file mode 100644
index 0000000..1b866d0
--- /dev/null
+++ b/semantic_exploration/models/semantic_map.py
@@ -0,0 +1,240 @@
+import numpy as np
+import third_party.semantic_exploration.envs.utils.depth_utils as du
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class ChannelPool(nn.MaxPool1d):
+ def forward(self, x):
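+        # Max-pool across the channel dimension: reshape (N, C, W, H) to
+        # (N, W*H, C), take the max over C, and restore the spatial layout with a
+        # single output channel.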
+ n, c, w, h = x.size()
+ x = x.view(n, c, w * h).permute(0, 2, 1)
+ x = x.contiguous()
+ pooled = F.max_pool1d(x, c, 1)
+ _, _, c = pooled.size()
+ pooled = pooled.permute(0, 2, 1)
+ return pooled.view(n, c, w, h)
+
+
+class Semantic_Mapping(nn.Module):
+
+ """
+ Semantic_Mapping
+ """
+
+ def __init__(self, config):
+ super(Semantic_Mapping, self).__init__()
+
+ self.device = config.DEVICE
+ self.screen_h = config.FRAME_HEIGHT
+ self.screen_w = config.FRAME_WIDTH
+ self.resolution = config.MAP_RESOLUTION
+ self.z_resolution = config.MAP_RESOLUTION
+ self.map_size_cm = config.MAP_SIZE_CM // config.GLOBAL_DOWNSCALING
+ self.n_channels = 3
+ self.vision_range = config.VISION_RANGE
+ self.dropout = 0.5
+ self.fov = config.HFOV
+ self.du_scale = config.DU_SCALE
+ self.cat_pred_threshold = config.CAT_PRED_THRESHOLD
+ self.exp_pred_threshold = config.EXP_PRED_THRESHOLD
+ self.map_pred_threshold = config.MAP_PRED_THRESHOLD
+ self.num_sem_categories = config.NUM_SEM_CATEGORIES
+
+ self.max_height = int(180 / self.z_resolution)
+ self.min_height = int(-40 / self.z_resolution)
+ self.agent_height = config.CAMERA_HEIGHT * 100.0
+ self.shift_loc = [self.vision_range * self.resolution // 2, 0, np.pi / 2.0]
+ self.camera_matrix = du.get_camera_matrix(
+ self.screen_w, self.screen_h, self.fov
+ )
+
+ self.pool = ChannelPool(1)
+
+ vr = self.vision_range
+
+ self.init_grid = (
+ torch.zeros(
+ config.NUM_PROCESSES,
+ 1 + self.num_sem_categories,
+ vr,
+ vr,
+ self.max_height - self.min_height,
+ )
+ .float()
+ .to(self.device)
+ )
+ self.feat = (
+ torch.ones(
+ config.NUM_PROCESSES,
+ 1 + self.num_sem_categories,
+ self.screen_h // self.du_scale * self.screen_w // self.du_scale,
+ )
+ .float()
+ .to(self.device)
+ )
+
+ def forward(self, obs, pose_obs, maps_last, poses_last):
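+        # High-level flow: back-project depth into an egocentric point cloud,
+        # splat occupancy and per-category features into a voxel grid, project
+        # height bands into obstacle / explored / semantic channels, then rotate
+        # and translate the egocentric view into the map frame using the updated
+        # pose and fuse it with the previous map by a per-cell max.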
+ bs, c, h, w = obs.size()
+ depth = obs[:, 3, :, :]
+ point_cloud_t = du.get_point_cloud_from_z_t(
+ depth, self.camera_matrix, self.device, scale=self.du_scale
+ )
+
+ agent_view_t = du.transform_camera_view_t(
+ point_cloud_t, self.agent_height, 0, self.device
+ )
+
+ agent_view_centered_t = du.transform_pose_t(
+ agent_view_t, self.shift_loc, self.device
+ )
+
+ max_h = self.max_height
+ min_h = self.min_height
+ xy_resolution = self.resolution
+ z_resolution = self.z_resolution
+ vision_range = self.vision_range
+ XYZ_cm_std = agent_view_centered_t.float()
+ XYZ_cm_std[..., :2] = XYZ_cm_std[..., :2] / xy_resolution
+ XYZ_cm_std[..., :2] = (
+ (XYZ_cm_std[..., :2] - vision_range // 2.0) / vision_range * 2.0
+ )
+ XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution
+ XYZ_cm_std[..., 2] = (
+ (XYZ_cm_std[..., 2] - (max_h + min_h) // 2.0) / (max_h - min_h) * 2.0
+ )
+ self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)(obs[:, 4:, :, :]).view(
+ bs, c - 4, h // self.du_scale * w // self.du_scale
+ )
+
+ XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2)
+ XYZ_cm_std = XYZ_cm_std.view(
+ XYZ_cm_std.shape[0],
+ XYZ_cm_std.shape[1],
+ XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3],
+ )
+
+ voxels = du.splat_feat_nd(
+ self.init_grid * 0.0, self.feat, XYZ_cm_std
+ ).transpose(2, 3)
+
+ min_z = int(25 / z_resolution - min_h)
+ max_z = int((self.agent_height + 1) / z_resolution - min_h)
+
+ agent_height_proj = voxels[..., min_z:max_z].sum(4)
+ all_height_proj = voxels.sum(4)
+
+ fp_map_pred = agent_height_proj[:, 0:1, :, :]
+ fp_exp_pred = all_height_proj[:, 0:1, :, :]
+ fp_map_pred = fp_map_pred / self.map_pred_threshold
+ fp_exp_pred = fp_exp_pred / self.exp_pred_threshold
+ fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0)
+ fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0)
+
+ pose_pred = poses_last
+
+ agent_view = torch.zeros(
+ bs,
+ c,
+ self.map_size_cm // self.resolution,
+ self.map_size_cm // self.resolution,
+ ).to(self.device)
+
+ x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2
+ x2 = x1 + self.vision_range
+ y1 = self.map_size_cm // (self.resolution * 2)
+ y2 = y1 + self.vision_range
+
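+ # Paste the egocentric projections into a map-sized canvas just in front of the agent's position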
+ agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred
+ agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred
+ agent_view[:, 4:, y1:y2, x1:x2] = torch.clamp(
+ agent_height_proj[:, 1:, :, :] / self.cat_pred_threshold, min=0.0, max=1.0
+ )
+
+ corrected_pose = pose_obs
+
+ def get_new_pose_batch(pose, rel_pose_change):
+
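+ # 57.29577951308232 == 180 / pi: headings are stored in degrees, trig functions need radians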
+ pose[:, 1] += rel_pose_change[:, 0] * torch.sin(
+ pose[:, 2] / 57.29577951308232
+ ) + rel_pose_change[:, 1] * torch.cos(pose[:, 2] / 57.29577951308232)
+ pose[:, 0] += rel_pose_change[:, 0] * torch.cos(
+ pose[:, 2] / 57.29577951308232
+ ) - rel_pose_change[:, 1] * torch.sin(pose[:, 2] / 57.29577951308232)
+ pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232
+
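+ # Wrap the heading angle into [-180, 180) degrees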
+ pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0
+ pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0
+
+ return pose
+
+ current_poses = get_new_pose_batch(poses_last, corrected_pose)
+ st_pose = current_poses.clone().detach()
+
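+ # Convert the pose (x, y in meters, heading in degrees) into normalized coordinates for the spatial transform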
+ st_pose[:, :2] = -(
+ st_pose[:, :2] * 100.0 / self.resolution
+ - self.map_size_cm // (self.resolution * 2)
+ ) / (self.map_size_cm // (self.resolution * 2))
+ st_pose[:, 2] = 90.0 - (st_pose[:, 2])
+
+ rot_mat, trans_mat = self.get_grid(st_pose, agent_view.size(), self.device)
+
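+ # Rotate, then translate the egocentric map into the allocentric map frame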
+ rotated = F.grid_sample(agent_view, rot_mat, align_corners=True)
+ translated = F.grid_sample(rotated, trans_mat, align_corners=True)
+
+ # If people are detected in the current view, clear the stale people
+ # channel (index 19) from the previous map before fusing
+ if translated[:, 19, :, :].sum() > 0.99:
+ maps_last[:, 19, :, :] = 0
+
+ maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1)
+
+ map_pred, _ = torch.max(maps2, 1)
+
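+ # Debugging aid: drop into pdb if the explored-area channel of the fused map is empty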
+ if map_pred[0, 1, :, :].sum().item() == 0:
+ import pdb
+
+ pdb.set_trace()
+
+ return fp_map_pred, map_pred, pose_pred, current_poses
+
+ @staticmethod
+ def get_grid(pose, grid_size, device):
+ """
+ Input:
+ `pose` FloatTensor(bs, 3)
+ `grid_size` 4-tuple (bs, _, grid_h, grid_w)
+ `device` torch.device (cpu or gpu)
+ Output:
+ `rot_grid` FloatTensor(bs, grid_h, grid_w, 2)
+ `trans_grid` FloatTensor(bs, grid_h, grid_w, 2)
+
+ """
+ pose = pose.float()
+ x = pose[:, 0]
+ y = pose[:, 1]
+ t = pose[:, 2]
+
+ t = t * np.pi / 180.0
+ cos_t = t.cos()
+ sin_t = t.sin()
+
+ theta11 = torch.stack(
+ [cos_t, -sin_t, torch.zeros(cos_t.shape).float().to(device)], 1
+ )
+ theta12 = torch.stack(
+ [sin_t, cos_t, torch.zeros(cos_t.shape).float().to(device)], 1
+ )
+ theta1 = torch.stack([theta11, theta12], 1)
+
+ theta21 = torch.stack(
+ [torch.ones(x.shape).to(device), -torch.zeros(x.shape).to(device), x], 1
+ )
+ theta22 = torch.stack(
+ [torch.zeros(x.shape).to(device), torch.ones(x.shape).to(device), y], 1
+ )
+ theta2 = torch.stack([theta21, theta22], 1)
+
+ # align_corners must match the grid_sample calls in forward()
+ rot_grid = F.affine_grid(theta1, torch.Size(grid_size), align_corners=True)
+ trans_grid = F.affine_grid(theta2, torch.Size(grid_size), align_corners=True)
+
+ return rot_grid, trans_grid
diff --git a/semantic_exploration/models/sentence_similarity.py b/semantic_exploration/models/sentence_similarity.py
new file mode 100644
index 0000000..f574a63
--- /dev/null
+++ b/semantic_exploration/models/sentence_similarity.py
@@ -0,0 +1,73 @@
+import torch
+import torch.nn.functional as F
+from transformers import AutoModel, AutoTokenizer
+
+
+class SentenceSimilarity:
+ def __init__(self):
+ # Load model from HuggingFace Hub
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ "sentence-transformers/all-MiniLM-L6-v2"
+ )
+ self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
+ def mean_pooling(self, model_output, attention_mask):
+ # Mean Pooling - Take attention mask into account for correct averaging
+
+ token_embeddings = model_output[
+ 0
+ ] # First element of model_output contains all token embeddings
+ input_mask_expanded = (
+ attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ )
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+ input_mask_expanded.sum(1), min=1e-9
+ )
+
+ def get_similarity_two_sentences(self, a, b):
+ sentences = [a, b]
+
+ # Tokenize sentences
+ encoded_input = self.tokenizer(
+ sentences, padding=True, truncation=True, return_tensors="pt"
+ )
+
+ # Compute token embeddings
+ with torch.no_grad():
+ model_output = self.model(**encoded_input)
+
+ # Perform pooling
+ sentence_embeddings = self.mean_pooling(
+ model_output, encoded_input["attention_mask"]
+ )
+
+ # Normalize embeddings
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+ # compute cosine similarity between embeddings
+ cosine_scores = sentence_embeddings[0] @ sentence_embeddings[1].T
+ return cosine_scores
+
+ def get_most_similar_in_list(self, query_word, word_list):
+ sentences = [query_word] + [word.replace("_", " ") for word in word_list]
+ encoded_input = self.tokenizer(
+ sentences, padding=True, truncation=True, return_tensors="pt"
+ )
+ with torch.no_grad():
+ model_output = self.model(**encoded_input)
+
+ # Perform pooling
+ sentence_embeddings = self.mean_pooling(
+ model_output, encoded_input["attention_mask"]
+ )
+
+ # Normalize embeddings
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+ # compute cosine similarity between embeddings
+ cosine_scores = sentence_embeddings[0] @ sentence_embeddings[1:].T
+ print(
+ f"word queried: {query_word} | word list: {word_list} | cosine scores: {cosine_scores}"
+ )
+
+ return word_list[torch.argmax(cosine_scores).item()]
diff --git a/test.py b/test.py
deleted file mode 100644
index 7a81a77..0000000
--- a/test.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import torch
-import numpy as np
-
-from envs import make_vec_envs
-from arguments import get_args
-
-os.environ["OMP_NUM_THREADS"] = "1"
-
-args = get_args()
-
-np.random.seed(args.seed)
-torch.manual_seed(args.seed)
-
-if args.cuda:
- torch.cuda.manual_seed(args.seed)
-
-
-def main():
- num_episodes = int(args.num_eval_episodes)
- args.device = torch.device("cuda:0" if args.cuda else "cpu")
-
- torch.set_num_threads(1)
- envs = make_vec_envs(args)
- obs, infos = envs.reset()
-
- for ep_num in range(num_episodes):
- for step in range(args.max_episode_length):
- action = torch.randint(0, 3, (args.num_processes,))
- obs, rew, done, infos = envs.step(action)
-
- if done:
- break
-
- print("Test successfully completed")
-
-
-if __name__ == "__main__":
- main()
diff --git a/utils/distributions.py b/utils/distributions.py
deleted file mode 100644
index cd025eb..0000000
--- a/utils/distributions.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# The following code is largely borrowed from:
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py
-
-import torch
-import torch.nn as nn
-
-from utils.model import AddBias
-
-"""
-Modify standard PyTorch distributions so they are compatible with this code.
-"""
-
-FixedCategorical = torch.distributions.Categorical
-
-old_sample = FixedCategorical.sample
-FixedCategorical.sample = lambda self: old_sample(self)
-
-log_prob_cat = FixedCategorical.log_prob
-FixedCategorical.log_probs = lambda self, actions: \
- log_prob_cat(self, actions.squeeze(-1))
-FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True)
-
-FixedNormal = torch.distributions.Normal
-log_prob_normal = FixedNormal.log_prob
-FixedNormal.log_probs = lambda self, actions: \
- log_prob_normal(self, actions).sum(-1, keepdim=False)
-
-entropy = FixedNormal.entropy
-FixedNormal.entropy = lambda self: entropy(self).sum(-1)
-
-FixedNormal.mode = lambda self: self.mean
-
-
-class Categorical(nn.Module):
-
- def __init__(self, num_inputs, num_outputs):
- super(Categorical, self).__init__()
- self.linear = nn.Linear(num_inputs, num_outputs)
-
- def forward(self, x):
- x = self.linear(x)
- return FixedCategorical(logits=x)
-
-
-class DiagGaussian(nn.Module):
-
- def __init__(self, num_inputs, num_outputs):
- super(DiagGaussian, self).__init__()
-
- self.fc_mean = nn.Linear(num_inputs, num_outputs)
- self.logstd = AddBias(torch.zeros(num_outputs))
-
- def forward(self, x):
- action_mean = self.fc_mean(x)
-
- zeros = torch.zeros(action_mean.size())
- if x.is_cuda:
- zeros = zeros.cuda()
-
- action_logstd = self.logstd(zeros)
- return FixedNormal(action_mean, action_logstd.exp())
diff --git a/utils/model.py b/utils/model.py
deleted file mode 100644
index e55b045..0000000
--- a/utils/model.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import numpy as np
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-
-def get_grid(pose, grid_size, device):
- """
- Input:
- `pose` FloatTensor(bs, 3)
- `grid_size` 4-tuple (bs, _, grid_h, grid_w)
- `device` torch.device (cpu or gpu)
- Output:
- `rot_grid` FloatTensor(bs, grid_h, grid_w, 2)
- `trans_grid` FloatTensor(bs, grid_h, grid_w, 2)
-
- """
- pose = pose.float()
- x = pose[:, 0]
- y = pose[:, 1]
- t = pose[:, 2]
-
- bs = x.size(0)
- t = t * np.pi / 180.
- cos_t = t.cos()
- sin_t = t.sin()
-
- theta11 = torch.stack([cos_t, -sin_t,
- torch.zeros(cos_t.shape).float().to(device)], 1)
- theta12 = torch.stack([sin_t, cos_t,
- torch.zeros(cos_t.shape).float().to(device)], 1)
- theta1 = torch.stack([theta11, theta12], 1)
-
- theta21 = torch.stack([torch.ones(x.shape).to(device),
- -torch.zeros(x.shape).to(device), x], 1)
- theta22 = torch.stack([torch.zeros(x.shape).to(device),
- torch.ones(x.shape).to(device), y], 1)
- theta2 = torch.stack([theta21, theta22], 1)
-
- rot_grid = F.affine_grid(theta1, torch.Size(grid_size))
- trans_grid = F.affine_grid(theta2, torch.Size(grid_size))
-
- return rot_grid, trans_grid
-
-
-class ChannelPool(nn.MaxPool1d):
- def forward(self, x):
- n, c, w, h = x.size()
- x = x.view(n, c, w * h).permute(0, 2, 1)
- x = x.contiguous()
- pooled = F.max_pool1d(x, c, 1)
- _, _, c = pooled.size()
- pooled = pooled.permute(0, 2, 1)
- return pooled.view(n, c, w, h)
-
-
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/utils.py#L32
-class AddBias(nn.Module):
- def __init__(self, bias):
- super(AddBias, self).__init__()
- self._bias = nn.Parameter(bias.unsqueeze(1))
-
- def forward(self, x):
- if x.dim() == 2:
- bias = self._bias.t().view(1, -1)
- else:
- bias = self._bias.t().view(1, -1, 1, 1)
-
- return x + bias
-
-
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L10
-class Flatten(nn.Module):
- def forward(self, x):
- return x.view(x.size(0), -1)
-
-
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L82
-class NNBase(nn.Module):
-
- def __init__(self, recurrent, recurrent_input_size, hidden_size):
-
- super(NNBase, self).__init__()
- self._hidden_size = hidden_size
- self._recurrent = recurrent
-
- if recurrent:
- self.gru = nn.GRUCell(recurrent_input_size, hidden_size)
- nn.init.orthogonal_(self.gru.weight_ih.data)
- nn.init.orthogonal_(self.gru.weight_hh.data)
- self.gru.bias_ih.data.fill_(0)
- self.gru.bias_hh.data.fill_(0)
-
- @property
- def is_recurrent(self):
- return self._recurrent
-
- @property
- def rec_state_size(self):
- if self._recurrent:
- return self._hidden_size
- return 1
-
- @property
- def output_size(self):
- return self._hidden_size
-
- def _forward_gru(self, x, hxs, masks):
- if x.size(0) == hxs.size(0):
- x = hxs = self.gru(x, hxs * masks[:, None])
- else:
- # x is a (T, N, -1) tensor that has been flatten to (T * N, -1)
- N = hxs.size(0)
- T = int(x.size(0) / N)
-
- # unflatten
- x = x.view(T, N, x.size(1))
-
- # Same deal with masks
- masks = masks.view(T, N, 1)
-
- outputs = []
- for i in range(T):
- hx = hxs = self.gru(x[i], hxs * masks[i])
- outputs.append(hx)
-
- # x is a (T, N, -1) tensor
- x = torch.stack(outputs, dim=0)
- # flatten
- x = x.view(T * N, -1)
-
- return x, hxs
diff --git a/utils/optimization.py b/utils/optimization.py
deleted file mode 100644
index 7f4050b..0000000
--- a/utils/optimization.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import inspect
-import re
-
-from torch import optim
-
-
-def get_optimizer(parameters, s):
- """
- Parse optimizer parameters.
- Input should be of the form:
- - "sgd,lr=0.01"
- - "adagrad,lr=0.1,lr_decay=0.05"
- """
- if "," in s:
- method = s[:s.find(',')]
- optim_params = {}
- for x in s[s.find(',') + 1:].split(','):
- split = x.split('=')
- assert len(split) == 2
- assert re.match(
- r"^[+-]?(\d+(\.\d*)?|\.\d+)$",
- split[1]) is not None
- optim_params[split[0]] = float(split[1])
- else:
- method = s
- optim_params = {}
-
- if method == 'adadelta':
- optim_fn = optim.Adadelta
- elif method == 'adagrad':
- optim_fn = optim.Adagrad
- elif method == 'adam':
- optim_fn = optim.Adam
- optim_params['betas'] = (optim_params.get('beta1', 0.5),
- optim_params.get('beta2', 0.999))
- optim_params.pop('beta1', None)
- optim_params.pop('beta2', None)
- elif method == 'adamax':
- optim_fn = optim.Adamax
- elif method == 'asgd':
- optim_fn = optim.ASGD
- elif method == 'rmsprop':
- optim_fn = optim.RMSprop
- elif method == 'rprop':
- optim_fn = optim.Rprop
- elif method == 'sgd':
- optim_fn = optim.SGD
- assert 'lr' in optim_params
- else:
- raise Exception('Unknown optimization method: "%s"' % method)
-
- # check that we give good parameters to the optimizer
- expected_args = inspect.getargspec(optim_fn.__init__)[0]
- assert expected_args[:2] == ['self', 'params']
- if not all(k in expected_args[2:] for k in optim_params.keys()):
- raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
- str(expected_args[2:]), str(optim_params.keys())))
-
- return optim_fn(parameters, **optim_params)
diff --git a/utils/storage.py b/utils/storage.py
deleted file mode 100644
index e71cac3..0000000
--- a/utils/storage.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# The following code is largely borrowed from:
-# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/storage.py
-
-from collections import namedtuple
-
-import numpy as np
-import torch
-from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
-
-
-def _flatten_helper(T, N, _tensor):
- return _tensor.view(T * N, *_tensor.size()[2:])
-
-
-class RolloutStorage(object):
-
- def __init__(self, num_steps, num_processes, obs_shape, action_space,
- rec_state_size):
-
- if action_space.__class__.__name__ == 'Discrete':
- self.n_actions = 1
- action_type = torch.long
- else:
- self.n_actions = action_space.shape[0]
- action_type = torch.float32
-
- self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape)
- self.rec_states = torch.zeros(num_steps + 1, num_processes,
- rec_state_size)
- self.rewards = torch.zeros(num_steps, num_processes)
- self.value_preds = torch.zeros(num_steps + 1, num_processes)
- self.returns = torch.zeros(num_steps + 1, num_processes)
- self.action_log_probs = torch.zeros(num_steps, num_processes)
- self.actions = torch.zeros((num_steps, num_processes, self.n_actions),
- dtype=action_type)
- self.masks = torch.ones(num_steps + 1, num_processes)
-
- self.num_steps = num_steps
- self.step = 0
- self.has_extras = False
- self.extras_size = None
-
- def to(self, device):
- self.obs = self.obs.to(device)
- self.rec_states = self.rec_states.to(device)
- self.rewards = self.rewards.to(device)
- self.value_preds = self.value_preds.to(device)
- self.returns = self.returns.to(device)
- self.action_log_probs = self.action_log_probs.to(device)
- self.actions = self.actions.to(device)
- self.masks = self.masks.to(device)
- if self.has_extras:
- self.extras = self.extras.to(device)
- return self
-
- def insert(self, obs, rec_states, actions, action_log_probs, value_preds,
- rewards, masks):
- self.obs[self.step + 1].copy_(obs)
- self.rec_states[self.step + 1].copy_(rec_states)
- self.actions[self.step].copy_(actions.view(-1, self.n_actions))
- self.action_log_probs[self.step].copy_(action_log_probs)
- self.value_preds[self.step].copy_(value_preds)
- self.rewards[self.step].copy_(rewards)
- self.masks[self.step + 1].copy_(masks)
-
- self.step = (self.step + 1) % self.num_steps
-
- def after_update(self):
- self.obs[0].copy_(self.obs[-1])
- self.rec_states[0].copy_(self.rec_states[-1])
- self.masks[0].copy_(self.masks[-1])
- if self.has_extras:
- self.extras[0].copy_(self.extras[-1])
-
- def compute_returns(self, next_value, use_gae, gamma, tau):
- if use_gae:
- self.value_preds[-1] = next_value
- gae = 0
- for step in reversed(range(self.rewards.size(0))):
- delta = self.rewards[step] + gamma \
- * self.value_preds[step + 1] * self.masks[step + 1] \
- - self.value_preds[step]
- gae = delta + gamma * tau * self.masks[step + 1] * gae
- self.returns[step] = gae + self.value_preds[step]
- else:
- self.returns[-1] = next_value
- for step in reversed(range(self.rewards.size(0))):
- self.returns[step] = self.returns[step + 1] * gamma \
- * self.masks[step + 1] + self.rewards[step]
-
- def feed_forward_generator(self, advantages, num_mini_batch):
-
- num_steps, num_processes = self.rewards.size()[0:2]
- batch_size = num_processes * num_steps
- mini_batch_size = batch_size // num_mini_batch
- assert batch_size >= num_mini_batch, (
- "PPO requires the number of processes ({}) "
- "* number of steps ({}) = {} "
- "to be greater than or equal to "
- "the number of PPO mini batches ({})."
- "".format(num_processes, num_steps, num_processes * num_steps,
- num_mini_batch))
-
- sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
- mini_batch_size, drop_last=False)
-
- for indices in sampler:
- yield {
- 'obs': self.obs[:-1].view(-1, *self.obs.size()[2:])[indices],
- 'rec_states': self.rec_states[:-1].view(
- -1, self.rec_states.size(-1))[indices],
- 'actions': self.actions.view(-1, self.n_actions)[indices],
- 'value_preds': self.value_preds[:-1].view(-1)[indices],
- 'returns': self.returns[:-1].view(-1)[indices],
- 'masks': self.masks[:-1].view(-1)[indices],
- 'old_action_log_probs': self.action_log_probs.view(-1)[indices],
- 'adv_targ': advantages.view(-1)[indices],
- 'extras': self.extras[:-1].view(
- -1, self.extras_size)[indices]
- if self.has_extras else None,
- }
-
- def recurrent_generator(self, advantages, num_mini_batch):
-
- num_processes = self.rewards.size(1)
- assert num_processes >= num_mini_batch, (
- "PPO requires the number of processes ({}) "
- "to be greater than or equal to the number of "
- "PPO mini batches ({}).".format(num_processes, num_mini_batch))
- num_envs_per_batch = num_processes // num_mini_batch
- perm = torch.randperm(num_processes)
- T, N = self.num_steps, num_envs_per_batch
-
- for start_ind in range(0, num_processes, num_envs_per_batch):
-
- obs = []
- rec_states = []
- actions = []
- value_preds = []
- returns = []
- masks = []
- old_action_log_probs = []
- adv_targ = []
- if self.has_extras:
- extras = []
-
- for offset in range(num_envs_per_batch):
-
- ind = perm[start_ind + offset]
- obs.append(self.obs[:-1, ind])
- rec_states.append(self.rec_states[0:1, ind])
- actions.append(self.actions[:, ind])
- value_preds.append(self.value_preds[:-1, ind])
- returns.append(self.returns[:-1, ind])
- masks.append(self.masks[:-1, ind])
- old_action_log_probs.append(self.action_log_probs[:, ind])
- adv_targ.append(advantages[:, ind])
- if self.has_extras:
- extras.append(self.extras[:-1, ind])
-
- # These are all tensors of size (T, N, ...)
- obs = torch.stack(obs, 1)
- actions = torch.stack(actions, 1)
- value_preds = torch.stack(value_preds, 1)
- returns = torch.stack(returns, 1)
- masks = torch.stack(masks, 1)
- old_action_log_probs = torch.stack(old_action_log_probs, 1)
- adv_targ = torch.stack(adv_targ, 1)
- if self.has_extras:
- extras = torch.stack(extras, 1)
-
- yield {
- 'obs': _flatten_helper(T, N, obs),
- 'actions': _flatten_helper(T, N, actions),
- 'value_preds': _flatten_helper(T, N, value_preds),
- 'returns': _flatten_helper(T, N, returns),
- 'masks': _flatten_helper(T, N, masks),
- 'old_action_log_probs': _flatten_helper(
- T, N, old_action_log_probs),
- 'adv_targ': _flatten_helper(T, N, adv_targ),
- 'extras': _flatten_helper(
- T, N, extras) if self.has_extras else None,
- 'rec_states': torch.stack(rec_states, 1).view(N, -1),
- }
-
-
-class GlobalRolloutStorage(RolloutStorage):
-
- def __init__(self, num_steps, num_processes, obs_shape, action_space,
- rec_state_size, extras_size):
- super(GlobalRolloutStorage, self).__init__(
- num_steps, num_processes, obs_shape, action_space, rec_state_size)
- self.extras = torch.zeros((num_steps + 1, num_processes, extras_size),
- dtype=torch.long)
- self.has_extras = True
- self.extras_size = extras_size
-
- def insert(self, obs, rec_states, actions, action_log_probs, value_preds,
- rewards, masks, extras):
- self.extras[self.step + 1].copy_(extras)
- super(GlobalRolloutStorage, self).insert(
- obs, rec_states, actions,
- action_log_probs, value_preds, rewards, masks)