<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>VLAA</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/UCSC_icon.png" rel="icon">
<link href="assets/img/UCSC_icon.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/fontawesome-free/css/all.min.css" rel="stylesheet">
<link href="assets/vendor/animate.css/animate.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/remixicon/remixicon.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<link href="https://fonts.googleapis.com/css?family=Lato:100,300,400,700,900" rel="stylesheet">
<link rel="stylesheet" type="text/css" media="screen,print" href="assets/css_pub/style.css" />
<!-- <link href="assets/css_pub/bootstrap.min.css" rel="stylesheet" media="screen" /> -->
<link rel="icon" type="image/png" href="./images/logos/princeton.png">
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- =======================================================
* Template Name: Medilab - v4.7.1
* Template URL: https://bootstrapmade.com/medilab-free-medical-bootstrap-theme/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Top Bar ======= -->
<div id="topbar" class="d-flex align-items-center fixed-top">
<div class="container d-flex justify-content-between">
<div class="contact-info d-flex align-items-center">
</div>
<div class="d-none d-lg-flex social-links align-items-center">
<a href="opening.html" class="envelope"><i class="bi-envelope"></i></a>
</div>
</div>
</div>
<!-- ======= Header ======= -->
<header id="header" class="fixed-top">
<div class="container d-flex align-items-center">
<h1 class="logo me-auto"><a href="index.html">VLAA lab</a></h1>
<nav id="navbar" class="navbar order-last order-lg-0">
<ul>
<li><a class="nav-link scrollto" href="index.html">Home</a></li>
<li><a class="nav-link scrollto" href="people.html">People</a></li>
<li><a class="nav-link scrollto active" href="publications.html">Publications</a></li>
<li><a class="nav-link scrollto" href="https://github.com/UCSC-VLAA">GitHub</a></li>
<li><a class="nav-link scrollto" href="https://huggingface.co/UCSC-VLAA">HuggingFace</a></li>
<li><a class="nav-link scrollto" href="opening.html">Opening</a></li>
</ul>
<i class="bi bi-list mobile-nav-toggle"></i>
</nav><!-- .navbar -->
</div>
</header><!-- End Header -->
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<div class="section-title">
<h2>Publications</h2>
</div>
<script>
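// Toggles the inline detail panel for a paper entry: clicking the same
// "bib"/"abstract" label again clears and collapses the panel; clicking a
// different label copies that hidden block's HTML into the shared per-paper
// div and styles it as a dotted, shaded box.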
function copy(dest, source) {
if(dest.source == source) {
dest.innerHTML = "";
dest.source = null;
dest.style.width="0px";
dest.style.border = "";
dest.style.padding = "0px";
}
else {
dest.innerHTML = source.innerHTML;
dest.source = source;
dest.style.width = "800px";
dest.style.padding = "10px";
dest.style.border = "2px dotted gray";
dest.style.background = "#F5F5F5";
dest.style.margin = "10px";
}
dest.blur();
}
</script>
<div class="container">
<!-- <h1> Papers</h1> -->
<br>
<!-- <p>(*: equal contribution)</p> -->
<details>
<summary><font size="5">Pre-print</font></summary>
<script>
let paper_count = 0
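// Writes one publication entry into the current list via document.write.
// Arguments that are null are simply skipped:
//   title, authors  - displayed text (title becomes a link when `link` is set)
//   conference      - venue string appended after the authors
//   link            - main URL wrapped around the title
//   bib, abstract   - hidden HTML blocks, shown on demand via copy()
//   arxiv_link, code, press, slides, talk - optional labeled links
//   msg             - optional italicized note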
function add_paper(title, authors, conference, link, bib, abstract, arxiv_link, code, press, slides, talk, msg) {
list_entry = "<li style=\"font-size:18px\">"
if (link != null)
list_entry += "<a href=\"" + link + "\">"
list_entry += "<b>" + title + "</b>"
if (link != null)
list_entry += "</a>"
list_entry += "<br>" + authors + ".<br>"
if (conference != null)
list_entry+= conference + ".</li>"
if (bib != null) {
list_entry += "<div id=\"bib" + paper_count + "\" style=\"display:none\">" + bib + "</div>"
list_entry += "<a href=\"javascript:copy(div" + paper_count + ",bib" + paper_count + ")\"> <span class=\"label label-success\">bib</span></a>"
}
if (abstract != null) {
list_entry += "<div id=\"abstract" + paper_count + "\" style=\"display:none\">" + abstract + "</div>"
list_entry += "<a href=\"javascript:copy(div" + paper_count + ",abstract" + paper_count + ")\"> <span class=\"label label-warning\">abstract</span></a>"
}
if (arxiv_link != null)
list_entry += " <a href=\"" + arxiv_link + "\"><span class=\"label label-primary\">arxiv</span></a>"
if (code != null)
list_entry += " <a href=\"" + code + "\"><span class=\"label label-danger\">code/models</span></a>"
if (press != null)
list_entry += " <a href=\"" + press + "\"><span class=\"label label-success\">press</span></a>"
if (slides != null)
list_entry += " <a href=\"" + slides + "\"><span class=\"label label-info\">slides/poster</span></a>"
if (talk != null)
list_entry += " <a href=\"" + talk + "\"><span class=\"label label-default\">talk</span></a>"
list_entry += "<br>"
if (msg != null)
list_entry += "<i>" + msg + "</i>"
list_entry += "<div id=\"div" + paper_count + "\" style=\"font-size:15px\"></div><br>"
document.write(list_entry)
paper_count += 1
}
// document.write("<h2>Preprint</h2>")
// document.write("<ul>")
document.write("</ul>")
// document.write("<h2>Preprint</h2>")
document.write("<ul><br>")
add_paper("Complex-Edit: CoT-Like Instruction Generation for Complexity-Controllable Image Editing Benchmark",
"Siwei Yang, Mude Hui, Bingchen Zhao, Yuyin Zhou, Nataniel Ruiz, Cihang Xie",
null,
"https://arxiv.org/abs/2504.13143",
"@article{yang2025textttcomplexeditcotlikeinstructiongeneration,<br>" +
" title = {Complex-Edit: CoT-Like Instruction Generation for Complexity-Controllable Image Editing Benchmark},<br>" +
" author = {Siwei Yang, Mude Hui, Bingchen Zhao, Yuyin Zhou, Nataniel Ruiz, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2504.13143},<br>" +
" year = {2025},<br>",
"We introduce Complex-Edit, a comprehensive benchmark designed to systematically evaluate instruction-based image editing models across instructions of varying complexity. To develop this benchmark, we harness GPT-4o to automatically collect a diverse set of editing instructions at scale. Our approach follows a well-structured ``Chain-of-Edit'' pipeline: we first generate individual atomic editing tasks independently and then integrate them to form cohesive, complex instructions. Additionally, we introduce a suite of metrics to assess various aspects of editing performance, along with a VLM-based auto-evaluation pipeline that supports large-scale assessments. Our benchmark yields several notable insights: 1) Open-source models significantly underperform relative to proprietary, closed-source models, with the performance gap widening as instruction complexity increases; 2) Increased instructional complexity primarily impairs the models' ability to retain key elements from the input images and to preserve the overall aesthetic quality; 3) Decomposing a complex instruction into a sequence of atomic steps, executed in a step-by-step manner, substantially degrades performance across multiple metrics; 4) A straightforward Best-of-N selection strategy improves results for both direct editing and the step-by-step sequential approach; and 5) We observe a ``curse of synthetic data'': when synthetic data is involved in model training, the edited images from such models tend to appear increasingly synthetic as the complexity of the editing instructions rises -- a phenomenon that intriguingly also manifests in the latest GPT-4o outputs.",
"https://arxiv.org/abs/2504.13143",
"https://github.com/UCSC-VLAA/Complex-Edit"
)
add_paper("MedSegFactory: Text-Guided Generation of Medical Image-Mask Pairs",
"Jiawei Mao, Yuhan Wang, Yucheng Tang, Daguang Xu, Kang Wang, Yang Yang, Zongwei Zhou, Yuyin Zhou",
null,
"https://arxiv.org/abs/2504.06897",
"@article{mao2025medsegfactory,<br>" +
" title = {MedSegFactory: Text-Guided Generation of Medical Image-Mask Pairs},<br>" +
" author = {Jiawei Mao, Yuhan Wang, Yucheng Tang, Daguang Xu, Kang Wang, Yang Yang, Zongwei Zhou, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2504.06897},<br>" +
" year = {2025},<br>",
"This paper presents MedSegFactory, a versatile medical synthesis framework that generates high-quality paired medical images and segmentation masks across modalities and tasks. It aims to serve as an unlimited data repository, supplying image-mask pairs to enhance existing segmentation tools. The core of MedSegFactory is a dual-stream diffusion model, where one stream synthesizes medical images and the other generates corresponding segmentation masks. To ensure precise alignment between image-mask pairs, we introduce Joint Cross-Attention (JCA), enabling a collaborative denoising paradigm by dynamic cross-conditioning between streams. This bidirectional interaction allows both representations to guide each other's generation, enhancing consistency between generated pairs. MedSegFactory unlocks on-demand generation of paired medical images and segmentation masks through user-defined prompts that specify the target labels, imaging modalities, anatomical regions, and pathological conditions, facilitating scalable and high-quality data generation. This new paradigm of medical image synthesis enables seamless integration into diverse medical imaging workflows, enhancing both efficiency and accuracy. Extensive experiments show that MedSegFactory generates data of superior quality and usability, achieving competitive or state-of-the-art performance in 2D and 3D segmentation tasks while addressing data scarcity and regulatory constraints.",
"https://arxiv.org/abs/2504.06897",
"https://github.com/jwmao1/MedSegFactory"
)
add_paper("A Comprehensive Analysis of Mamba for 3D Volumetric Medical Image Segmentation",
"Chaohan Wang, Yutong Xie, Qi Chen, Yuyin Zhou, Qi Wu",
null,
"https://arxiv.org/abs/2503.19308",
"@article{wang2025comprehensive,<br>" +
" title = {A Comprehensive Analysis of Mamba for 3D Volumetric Medical Image Segmentation},<br>" +
" author = {Chaohan Wang, Yutong Xie, Qi Chen, Yuyin Zhou, Qi Wu},<br>" +
" journal = {arXiv preprint arXiv:2503.19308},<br>" +
" year = {2025},<br>",
"Mamba, with its selective State Space Models (SSMs), offers a more computationally efficient solution than Transformers for long-range dependency modeling. However, there is still a debate about its effectiveness in high-resolution 3D medical image segmentation. In this study, we present a comprehensive investigation into Mamba's capabilities in 3D medical image segmentation by tackling three pivotal questions: Can Mamba replace Transformers? Can it elevate multi-scale representation learning? Is complex scanning necessary to unlock its full potential? We evaluate Mamba's performance across three large public benchmarks-AMOS, TotalSegmentator, and BraTS. Our findings reveal that UlikeMamba, a U-shape Mamba-based network, consistently surpasses UlikeTrans, a U-shape Transformer-based network, particularly when enhanced with custom-designed 3D depthwise convolutions, boosting accuracy and computational efficiency. Further, our proposed multi-scale Mamba block demonstrates superior performance in capturing both fine-grained details and global context, especially in complex segmentation tasks, surpassing Transformer-based counterparts. We also critically assess complex scanning strategies, finding that simpler methods often suffice, while our Tri-scan approach delivers notable advantages in the most challenging scenarios. By integrating these advancements, we introduce a new network for 3D medical image segmentation, positioning Mamba as a transformative force that outperforms leading models such as nnUNet, CoTr, and U-Mamba, offering competitive accuracy with superior computational efficiency. This study provides key insights into Mamba's unique advantages, paving the way for more efficient and accurate approaches to 3D medical imaging.",
"https://arxiv.org/abs/2503.19308",
"https://arxiv.org/abs/2503.19308"
)
add_paper("ViLBench: A Suite for Vision-Language Process Reward Modeling",
"Haoqin Tu, Weitao Feng, Hardy Chen, Hui Liu, Xianfeng Tang, Cihang Xie",
null,
"https://arxiv.org/abs/2503.20271",
"@article{tu2025vilbench,<br>" +
" title = {ViLBench: A Suite for Vision-Language Process Reward Modeling},<br>" +
" author = {Haoqin Tu, Weitao Feng, Hardy Chen, Hui Liu, Xianfeng Tang, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2503.20271},<br>" +
" year = {2025},<br>",
"Process-supervised reward models serve as a fine-grained function that provides detailed step-wise feedback to model responses, facilitating effective selection of reasoning trajectories for complex tasks. Despite its advantages, evaluation on PRMs remains less explored, especially in the multimodal domain. To address this gap, this paper first benchmarks current vision large language models (VLLMs) as two types of reward models: output reward models (ORMs) and process reward models (PRMs) on multiple vision-language benchmarks, which reveal that neither ORM nor PRM consistently outperforms across all tasks, and superior VLLMs do not necessarily yield better rewarding performance. To further advance evaluation, we introduce ViLBench, a vision-language benchmark designed to require intensive process reward signals. Notably, OpenAI's GPT-4o with Chain-of-Thought (CoT) achieves only 27.3% accuracy, indicating the benchmark's challenge for current VLLMs. Lastly, we preliminarily showcase a promising pathway towards bridging the gap between general VLLMs and reward models -- by collecting 73.6K vision-language process reward data using an enhanced tree-search algorithm, our 3B model is able to achieve an average improvement of 3.3% over standard CoT and up to 2.5% compared to its untrained counterpart on ViLBench by selecting OpenAI o1's generations. We release the implementations with our code, model, and data.",
"https://arxiv.org/abs/2503.20271",
"https://ucsc-vlaa.github.io/ViLBench/"
)
add_paper("Exploring the Vulnerabilities of Federated Learning: A Deep Dive into Gradient Inversion Attacks",
"Pengxin Guo, Runxi Wang, Shuang Zeng, Jinjing Zhu, Haoning Jiang, Yanran Wang, Yuyin Zhou, Feifei Wang, Hui Xiong, Liangqiong Qu",
null,
"https://arxiv.org/abs/2503.11514",
"@article{guo2025exploring,<br>" +
" title = {Exploring the Vulnerabilities of Federated Learning: A Deep Dive into Gradient Inversion Attacks},<br>" +
" author = {Pengxin Guo, Runxi Wang, Shuang Zeng, Jinjing Zhu, Haoning Jiang, Yanran Wang, Yuyin Zhou, Feifei Wang, Hui Xiong, Liangqiong Qu},<br>" +
" journal = {arXiv preprint arXiv:2503.11514},<br>" +
" year = {2025},<br>",
"Federated Learning (FL) has emerged as a promising privacy-preserving collaborative model training paradigm without sharing raw data. However, recent studies have revealed that private information can still be leaked through shared gradient information and attacked by Gradient Inversion Attacks (GIA). While many GIA methods have been proposed, a detailed analysis, evaluation, and summary of these methods are still lacking. Although various survey papers summarize existing privacy attacks in FL, few studies have conducted extensive experiments to unveil the effectiveness of GIA and their associated limiting factors in this context. To fill this gap, we first undertake a systematic review of GIA and categorize existing methods into three types, i.e., \textit{optimization-based} GIA (OP-GIA), \textit{generation-based} GIA (GEN-GIA), and \textit{analytics-based} GIA (ANA-GIA). Then, we comprehensively analyze and evaluate the three types of GIA in FL, providing insights into the factors that influence their performance, practicality, and potential threats. Our findings indicate that OP-GIA is the most practical attack setting despite its unsatisfactory performance, while GEN-GIA has many dependencies and ANA-GIA is easily detectable, making them both impractical. Finally, we offer a three-stage defense pipeline to users when designing FL frameworks and protocols for better privacy protection and share some future research directions from the perspectives of attackers and defenders that we believe should be pursued. We hope that our study can help researchers design more robust FL frameworks to defend against these attacks.",
"https://arxiv.org/abs/2503.11514",
"https://arxiv.org/abs/2503.11514"
)
add_paper("SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models",
"Hardy Chen, Haoqin Tu, Fali Wang, Hui Liu, Xianfeng Tang, Xinya Du, Yuyin Zhou, Cihang Xie",
null,
"https://arxiv.org/abs/2504.11468",
"@article{wang2025star,<br>" +
" title = {SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models},<br>" +
" author = {Hardy Chen, Haoqin Tu, Fali Wang, Hui Liu, Xianfeng Tang, Xinya Du, Yuyin Zhou, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2504.11468},<br>" +
" year = {2025},<br>",
"This work revisits the dominant supervised fine-tuning (SFT) then reinforcement learning (RL) paradigm for training Large Vision-Language Models (LVLMs), and reveals a key finding: SFT can significantly undermine subsequent RL by inducing ``pseudo reasoning paths'' imitated from expert models. While these paths may resemble the native reasoning paths of RL models, they often involve prolonged, hesitant, less informative steps, and incorrect reasoning. To systematically study this effect, we introduce VLAA-Thinking, a new multimodal dataset designed to support reasoning in LVLMs. Constructed via a six-step pipeline involving captioning, reasoning distillation, answer rewrite and verification, VLAA-Thinking comprises high-quality, step-by-step visual reasoning traces for SFT, along with a more challenging RL split from the same data source. Using this dataset, we conduct extensive experiments comparing SFT, RL and their combinations. Results show that while SFT helps models learn reasoning formats, it often locks aligned models into imitative, rigid reasoning modes that impede further learning. In contrast, building on the Group Relative Policy Optimization (GRPO) with a novel mixed reward module integrating both perception and cognition signals, our RL approach fosters more genuine, adaptive reasoning behavior. Notably, our model VLAA-Thinker, based on Qwen2.5VL 3B, achieves top-1 performance on Open LMM Reasoning Leaderboard among 4B scale LVLMs, surpassing the previous state-of-the-art by 1.8%. We hope our findings provide valuable insights in developing reasoning-capable LVLMs and can inform future research in this area.",
"https://arxiv.org/abs/2504.11468",
"https://github.com/UCSC-VLAA/VLAA-Thinking"
)
add_paper("MedReason: Eliciting Factual Medical Reasoning Steps in LLMs via Knowledge Graphs",
"Juncheng Wu, Wenlong Deng, Xingxuan Li, Sheng Liu, Taomian Mi, Yifan Peng, Ziyang Xu, Yi Liu, Hyunjin Cho, Chang-In Choi, Yihan Cao, Hui Ren, Xiang Li, Xiaoxiao Li, Yuyin Zhou",
null,
"https://arxiv.org/pdf/2504.00993",
"@article{wu2025medreason,<br>" +
" title = {MedReason: Eliciting Factual Medical Reasoning Steps in LLMs via Knowledge Graphs},<br>" +
" author = {Juncheng Wu, Wenlong Deng, Xingxuan Li, Sheng Liu, Taomian Mi, Yifan Peng, Ziyang Xu, Yi Liu, Hyunjin Cho, Chang-In Choi, Yihan Cao, Hui Ren, Xiang Li, Xiaoxiao Li, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2504.00993},<br>" +
" year = {2025},<br>",
"Medical tasks such as diagnosis and treatment planning require precise and complex reasoning, particularly in life-critical domains. Unlike mathematical reasoning, medical reasoning demands meticulous, verifiable thought processes to ensure reliability and accuracy. However, there is a notable lack of datasets that provide transparent, step-by-step reasoning to validate and enhance the medical reasoning ability of AI models. To bridge this gap, we introduce MedReason, a large-scale high-quality medical reasoning dataset designed to enable faithful and explainable medical problem-solving in large language models (LLMs). We utilize a structured medical knowledge graph (KG) to convert clinical QA pairs into logical chains of reasoning, or ``thinking paths'', which trace connections from question elements to answers via relevant KG entities. Each path is validated for consistency with clinical logic and evidence-based medicine. Our pipeline generates detailed reasoning for various medical questions from 7 medical datasets, resulting in a dataset of 32,682 question-answer pairs, each with detailed, step-by-step explanations. Experiments demonstrate that fine-tuning with our dataset consistently boosts medical problem-solving capabilities, achieving significant gains of up to 7.7% for DeepSeek-Ditill-8B. Our top-performing model, MedReason-8B, outperforms the Huatuo-o1-8B, a state-of-the-art medical reasoning model, by up to 4.2% on the clinical benchmark MedBullets. We also engage medical professionals from diverse specialties to assess our dataset's quality, ensuring MedReason offers accurate and coherent medical reasoning.",
"https://arxiv.org/pdf/2504.00993",
"https://github.com/UCSC-VLAA/MedReason"
)
add_paper("m1: Unleash the Potential of Test-Time Scaling for Medical Reasoning with Large Language Models",
"Xiaoke Huang, Juncheng Wu, Hui Liu, Xianfeng Tang, Yuyin Zhou",
null,
"https://arxiv.org/abs/2504.00869",
"@article{huang2025m1,<br>" +
" title = {m1: Unleash the Potential of Test-Time Scaling for Medical Reasoning with Large Language Models},<br>" +
" author = {Xiaoke Huang, Juncheng Wu, Hui Liu, Xianfeng Tang, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2504.00869},<br>" +
" year = {2025},<br>",
"Test-time scaling has emerged as a powerful technique for enhancing the reasoning capabilities of large language models. However, its effectiveness in medical reasoning remains uncertain, as the medical domain fundamentally differs from mathematical tasks in terms of knowledge representation and decision-making processes. In this paper, we provide the first comprehensive investigation of test-time scaling for medical reasoning and present m1, a simple yet effective approach that increases a model's medical reasoning capability at inference. Our evaluation across diverse medical tasks demonstrates that test-time scaling consistently enhances medical reasoning, enabling lightweight fine-tuned models under 10B parameters to establish new state-of-the-art performance, while our 32B model rivals previous 70B-scale medical LLMs. However, we identify an optimal reasoning token budget of approximately 4K, beyond which performance may degrade due to overthinking. Budget forcing, which extends test-time computation through iterative prompts, helps models double-check answers but does not necessarily improve the overall medical QA performance and, in some cases, even introduces errors into previously correct responses. Our case-by-case analysis identifies insufficient medical knowledge as a key bottleneck that prevents further performance gains through test-time scaling. We find that increasing data scale, improving data quality, and expanding model capacity consistently enhance medical knowledge grounding, enabling continued performance improvements, particularly on challenging medical benchmarks where smaller models reach saturation. These findings underscore fundamental differences between medical and mathematical reasoning in LLMs, highlighting that enriched medical knowledge, other than increased reasoning depth alone, is essential for realizing the benefits of test-time scaling.",
"https://arxiv.org/abs/2504.00869",
"https://github.com/UCSC-VLAA/m1"
)
add_paper("STAR-1: Safer Alignment of Reasoning LLMs with 1K Data",
"Zijun Wang, Haoqin Tu, Yuhan Wang, Juncheng Wu, Jieru Mei, Brian R. Bartoldson, Bhavya Kailkhura, Cihang Xie",
null,
"https://arxiv.org/abs/2504.01903",
"@article{wang2025star,<br>" +
" title = {STAR-1: Safer Alignment of Reasoning LLMs with 1K Data},<br>" +
" author = {Zijun Wang, Haoqin Tu, Yuhan Wang, Juncheng Wu, Jieru Mei, Brian R. Bartoldson, Bhavya Kailkhura, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2504.01903},<br>" +
" year = {2025},<br>",
"This paper introduces STAR-1, a high-quality, just-1k-scale safety dataset specifically designed for large reasoning models (LRMs) like DeepSeek-R1. Built on three core principles -- diversity, deliberative reasoning, and rigorous filtering -- STAR-1 aims to address the critical needs for safety alignment in LRMs. Specifically, we begin by integrating existing open-source safety datasets from diverse sources. Then, we curate safety policies to generate policy-grounded deliberative reasoning samples. Lastly, we apply a GPT-4o-based safety scoring system to select training examples aligned with best practices. Experimental results show that fine-tuning LRMs with STAR-1 leads to an average 40% improvement in safety performance across four benchmarks, while only incurring a marginal decrease (e.g., an average of 1.1%) in reasoning ability measured across five reasoning tasks. Extensive ablation studies further validate the importance of our design principles in constructing STAR-1 and analyze its efficacy across both LRMs and traditional LLMs.",
"https://arxiv.org/abs/2504.01903",
"https://github.com/UCSC-VLAA/STAR-1"
)
add_paper("EpiFoundation: A Foundation Model for Single-Cell ATAC-seq via Peak-to-Gene Alignment",
"Juncheng Wu, Changxin Wan, Zhicheng Ji, Yuyin Zhou, Wenpin Hou",
null,
"https://www.biorxiv.org/content/10.1101/2025.02.05.636688",
"@article{wu2025epifoundation,<br>" +
" title = {EpiFoundation: A Foundation Model for Single-Cell ATAC-seq via Peak-to-Gene Alignment},<br>" +
" author = {Juncheng Wu, Changxin Wan, Zhicheng Ji, Yuyin Zhou, Wenpin Hou},<br>" +
" journal = {bioRxiv},<br>" +
" year = {2025},<br>",
"Foundation models exhibit strong capabilities for downstream tasks by learning generalized representations through self-supervised pre-training on large datasets. While several foundation models have been developed for single-cell RNA-seq (scRNA-seq) data, there is still a lack of models specifically tailored for single-cell ATAC-seq (scATAC-seq), which measures epigenetic information in individual cells. The principal challenge in developing such a model lies in the vast number of scATAC peaks and the significant sparsity of the data, which complicates the formulation of peak-to-peak correlations. To address this challenge, we introduce EpiFoundation, a foundation model for learning cell representations from the high-dimensional and sparse space of peaks. Epi-Foundation relies on an innovative cross-modality pre-training procedure with two key technical innovations. First, EpiFoundation exclusively processes the non-zero peak set, thereby enhancing the density of cell-specific information within the input data. Second, EpiFoundation utilizes dense gene expression information to supervise the pre-training process, aligning peak-to-gene correlations. EpiFoundation can handle various types of downstream tasks, including cell-type annotation, batch correction, and gene expression prediction. To train and validate EpiFoundation, we curated MiniAtlas, a dataset of 100,000+ single cells with paired scRNA-seq and scATAC-seq data, along with diverse test sets spanning various tissues and cell types for robust evaluation. EpiFoundation demonstrates state-of-the-art performance across multiple tissues and diverse downstream tasks.",
"https://www.biorxiv.org/content/10.1101/2025.02.05.636688",
"https://github.com/UCSC-VLAA/EpiFoundation"
)
add_paper("MethylProphet: A Generalized Gene-Contextual Model for Inferring Whole-Genome DNA Methylation Landscape",
"Xiaoke Huang, Qi Liu, Yifei Zhao, Xianfeng Tang, Yuyin Zhou, Wenpin Hou",
null,
"https://www.biorxiv.org/content/10.1101/2025.02.05.636730",
"@article{huang2025methylprophet,<br>" +
" title = {MethylProphet: A Generalized Gene-Contextual Model for Inferring Whole-Genome DNA Methylation Landscape},<br>" +
" author = {Xiaoke Huang, Qi Liu, Yifei Zhao, Xianfeng Tang, Yuyin Zhou, Wenpin Hou},<br>" +
" journal = {bioRxiv},<br>" +
" year = {2025},<br>",
"DNA methylation (DNAm), an epigenetic modification, regulates gene expression, influences phenotypes, and encodes inheritable information, making it critical for disease diagnosis, treatment, and prevention. While human genome contains approximately 28 million CpG sites where DNAm can be measured, only 1–3% of these sites are typically available in most datasets due to complex experimental protocols and high costs, hindering insights from DNAm data. Leveraging the relationship between gene expression and DNAm offers promise for computational inference, but existing statistical, machine learning, and masking-based generative Transformers face critical limitations: they cannot infer DNAm at unmeasured CpGs or in new samples effectively. To overcome these challenges, we introduce MethylProphet, a gene-guided, context-aware Transformer model designed for DNAm inference. MethylProphet employs a Bottleneck MLP for efficient gene profile compression and a specialized DNA sequence tokenizer, integrating global gene expression patterns with local CpG context through a Transformer encoder architecture. Trained on whole-genome bisulfite sequencing data from ENCODE (1.6B training CpG-sample pairs; 322B tokens), MethylProphet demonstrates strong performance in hold-out evaluations, effectively inferring DNAm for unmeasured CpGs and new samples. In addition, its application to 10842 pairs of gene expression and DNAm samples at TCGA chromosome 1 (450M training CpGsample pairs; 91B tokens) highlights its potential to facilitate pan-cancer DNAm landscape inference, offering a powerful tool for advancing epigenetic research and precision medicine. All codes, data, protocols, and models are publicly available via https://github.com/xk-huang/methylprophet/.",
"https://www.biorxiv.org/content/10.1101/2025.02.05.636730",
"https://github.com/xk-huang/methylprophet/"
)
add_paper("Scaling Laws in Patchification: An Image Is Worth 50,176 Tokens And More",
"Feng Wang, Yaodong Yu, Guoyizhe Wei, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie",
null,
"https://arxiv.org/abs/2502.03738",
"@article{wang2025scaling,<br>" +
" title = {Scaling Laws in Patchification: An Image Is Worth 50,176 Tokens And More},<br>" +
" author = {Feng Wang, Yaodong Yu, Guoyizhe Wei, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2502.03738},<br>" +
" year = {2025},<br>",
"Since the introduction of Vision Transformer (ViT), patchification has long been regarded as a de facto image tokenization approach for plain visual architectures. By compressing the spatial size of images, this approach can effectively shorten the token sequence and reduce the computational cost of ViT-like plain architectures. In this work, we aim to thoroughly examine the information loss caused by this patchification-based compressive encoding paradigm and how it affects visual understanding. We conduct extensive patch size scaling experiments and excitedly observe an intriguing scaling law in patchification: the models can consistently benefit from decreased patch sizes and attain improved predictive performance, until it reaches the minimum patch size of 1x1, i.e., pixel tokenization. This conclusion is broadly applicable across different vision tasks, various input scales, and diverse architectures such as ViT and the recent Mamba models. Moreover, as a by-product, we discover that with smaller patches, task-specific decoder heads become less critical for dense prediction. In the experiments, we successfully scale up the visual sequence to an exceptional length of 50,176 tokens, achieving a competitive test accuracy of 84.6% with a base-sized model on the ImageNet-1k benchmark. We hope this study can provide insights and theoretical foundations for future works of building non-compressive vision models. Code is available at https://github.com/wangf3014/Patch_Scaling.",
"https://arxiv.org/abs/2502.03738",
"https://github.com/wangf3014/Patch_Scaling"
)
add_paper("ARFlow: Autogressive Flow with Hybrid Linear Attention",
"Mude Hui, Rui-Jie Zhu, Songlin Yang, Yu Zhang, Zirui Wang, Yuyin Zhou, Jason Eshraghian, Cihang Xie",
null,
"https://arxiv.org/abs/2501.16085",
"@article{hui2025arflow,<br>" +
" title = {ARFlow: Autogressive Flow with Hybrid Linear Attention},<br>" +
" author = {Mude Hui, Rui-Jie Zhu, Songlin Yang, Yu Zhang, Zirui Wang, Yuyin Zhou, Jason Eshraghian, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2501.16085},<br>" +
" year = {2025},<br>",
"Flow models are effective at progressively generating realistic images, but they generally struggle to capture long-range dependencies during the generation process as they compress all the information from previous time steps into a single corrupted image. To address this limitation, we propose integrating autoregressive modeling -- known for its excellence in modeling complex, high-dimensional joint probability distributions -- into flow models. During training, at each step, we construct causally-ordered sequences by sampling multiple images from the same semantic category and applying different levels of noise, where images with higher noise levels serve as causal predecessors to those with lower noise levels. This design enables the model to learn broader category-level variations while maintaining proper causal relationships in the flow process. During generation, the model autoregressively conditions the previously generated images from earlier denoising steps, forming a contextual and coherent generation trajectory. Additionally, we design a customized hybrid linear attention mechanism tailored to our modeling approach to enhance computational efficiency. Our approach, termed ARFlow, under 400k training steps, achieves 14.08 FID scores on ImageNet at 128 * 128 without classifier-free guidance, reaching 4.34 FID with classifier-free guidance 1.5, significantly outperforming the previous flow-based model SiT's 9.17 FID. Extensive ablation studies demonstrate the effectiveness of our modeling strategy and chunk-wise attention design.",
"https://arxiv.org/abs/2501.16085"
)
add_paper("UD-Mamba: A pixel-level uncertainty-driven Mamba model for medical image segmentation",
"Weiren Zhao, Feng Wang, Yanran Wang, Yutong Xie, Qi Wu, Yuyin Zhou",
null,
"https://arxiv.org/abs/2502.02024",
"@article{zhao2025udmamba,<br>" +
" title = {UD-Mamba: A pixel-level uncertainty-driven Mamba model for medical image segmentation},<br>" +
" author = {Weiren Zhao, Feng Wang, Yanran Wang, Yutong Xie, Qi Wu, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2502.02024},<br>" +
" year = {2025},<br>",
"Recent advancements have highlighted the Mamba framework, a state-space model known for its efficiency in capturing long-range dependencies with linear computational complexity. While Mamba has shown competitive performance in medical image segmentation, it encounters difficulties in modeling local features due to the sporadic nature of traditional location-based scanning methods and the complex, ambiguous boundaries often present in medical images. To overcome these challenges, we propose Uncertainty-Driven Mamba (UD-Mamba), which redefines the pixel-order scanning process by incorporating channel uncertainty into the scanning mechanism. UD-Mamba introduces two key scanning techniques: 1) sequential scanning, which prioritizes regions with high uncertainty by scanning in a row-by-row fashion, and 2) skip scanning, which processes columns vertically, moving from high-to-low or low-to-high uncertainty at fixed intervals. Sequential scanning efficiently clusters high-uncertainty regions, such as boundaries and foreground objects, to improve segmentation precision, while skip scanning enhances the interaction between background and foreground regions, allowing for timely integration of background information to support more accurate foreground inference. Recognizing the advantages of scanning from certain to uncertain areas, we introduce four learnable parameters to balance the importance of features extracted from different scanning methods. Additionally, a cosine consistency loss is employed to mitigate the drawbacks of transitioning between uncertain and certain regions during the scanning process. Our method demonstrates robust segmentation performance, validated across three distinct medical imaging datasets involving pathology, dermatological lesions, and cardiac tasks.",
"https://arxiv.org/abs/2502.02024"
)
add_paper("Safety at Scale: A Comprehensive Survey of Large Model Safety",
"Xingjun Ma, Yifeng Gao, Yixu Wang, Ruofan Wang, Xin Wang, Ye Sun, Yifan Ding, Hengyuan Xu, Yunhao Chen, Yunhan Zhao, Hanxun Huang, Yige Li, Jiaming Zhang, Xiang Zheng, Yang Bai, Zuxuan Wu, Xipeng Qiu, Jingfeng Zhang, Yiming Li, Jun Sun, Cong Wang, Jindong Gu, Baoyuan Wu, Siheng Chen, Tianwei Zhang, Yang Liu, Mingming Gong, Tongliang Liu, Shirui Pan, Cihang Xie, Tianyu Pang, Yinpeng Dong, Ruoxi Jia, Yang Zhang, Shiqing Ma, Xiangyu Zhang, Neil Gong, Chaowei Xiao, Sarah Erfani, Bo Li, Masashi Sugiyama, Dacheng Tao, James Bailey, Yu-Gang Jiang",
null,
"https://arxiv.org/abs/2502.05206",
"@article{ma2025safety,<br>" +
" title = {Safety at Scale: A Comprehensive Survey of Large Model Safety},<br>" +
" author = {Xingjun Ma, Yifeng Gao, Yixu Wang, Ruofan Wang, Xin Wang, Ye Sun, Yifan Ding, Hengyuan Xu, Yunhao Chen, Yunhan Zhao, Hanxun Huang, Yige Li, Jiaming Zhang, Xiang Zheng, Yang Bai, Zuxuan Wu, Xipeng Qiu, Jingfeng Zhang, Yiming Li, Jun Sun, Cong Wang, Jindong Gu, Baoyuan Wu, Siheng Chen, Tianwei Zhang, Yang Liu, Mingming Gong, Tongliang Liu, Shirui Pan, Cihang Xie, Tianyu Pang, Yinpeng Dong, Ruoxi Jia, Yang Zhang, Shiqing Ma, Xiangyu Zhang, Neil Gong, Chaowei Xiao, Sarah Erfani, Bo Li, Masashi Sugiyama, Dacheng Tao, James Bailey, Yu-Gang Jiang},<br>" +
" journal = {arXiv preprint arXiv:2502.05206},<br>" +
" year = {2025},<br>",
"The rapid advancement of large models, driven by their exceptional abilities in learning and generalization through large-scale pre-training, has reshaped the landscape of Artificial Intelligence (AI). These models are now foundational to a wide range of applications, including conversational AI, recommendation systems, autonomous driving, content generation, medical diagnostics, and scientific discovery. However, their widespread deployment also exposes them to significant safety risks, raising concerns about robustness, reliability, and ethical implications. This survey provides a systematic review of current safety research on large models, covering Vision Foundation Models (VFMs), Large Language Models (LLMs), Vision-Language Pre-training (VLP) models, Vision-Language Models (VLMs), Diffusion Models (DMs), and large-model-based Agents. Our contributions are summarized as follows: (1) We present a comprehensive taxonomy of safety threats to these models, including adversarial attacks, data poisoning, backdoor attacks, jailbreak and prompt injection attacks, energy-latency attacks, data and model extraction attacks, and emerging agent-specific threats. (2) We review defense strategies proposed for each type of attacks if available and summarize the commonly used datasets and benchmarks for safety research. (3) Building on this, we identify and discuss the open challenges in large model safety, emphasizing the need for comprehensive safety evaluations, scalable and effective defense mechanisms, and sustainable data practices. More importantly, we highlight the necessity of collective efforts from the research community and international collaboration. Our work can serve as a useful reference for researchers and practitioners, fostering the ongoing development of comprehensive defense systems and platforms to safeguard AI models.",
"https://arxiv.org/abs/2502.05206",
"https://github.com/xingjunm/Awesome-Large-Model-Safety"
)
add_paper("Double Visual Defense: Adversarial Pre-training and Instruction Tuning for Improving Vision-Language Model Robustness",
"Zeyu Wang, Cihang Xie, Brian Bartoldson, Bhavya Kailkhura",
null,
"https://arxiv.org/abs/2501.09446",
"@article{wang2024double,<br>" +
" title = {Double Visual Defense: Adversarial Pre-training and Instruction Tuning for Improving Vision-Language Model Robustness},<br>" +
" author = {Zeyu Wang, Cihang Xie, Brian Bartoldson, Bhavya Kailkhura},<br>" +
" journal = {arXiv preprint arXiv:2501.09446},<br>" +
" year = {2025},<br>",
"This paper investigates the robustness of vision-language models against adversarial visual perturbations and introduces a novel \"double visual defense\" to enhance this robustness. Unlike previous approaches that resort to lightweight adversarial fine-tuning of a pre-trained CLIP model, we perform large-scale adversarial vision-language pre-training from scratch using web-scale data. We then strengthen the defense by incorporating adversarial visual instruction tuning. The resulting models from each stage, ΔCLIP and Δ2LLaVA, show substantially enhanced zero-shot robustness and set a new state-of-the-art in adversarial defense for vision-language models. For example, the adversarial robustness of ΔCLIP surpasses that of the previous best models on ImageNet-1k by ~20%. Similarly, compared to prior art, Δ2LLaVA brings a ~30% robustness improvement to image captioning task and a ~20% robustness improvement to visual question answering task. Furthermore, our models exhibit stronger zero-shot recognition capability, fewer hallucinations, and superior reasoning performance compared to baselines. Our project page is https://doublevisualdefense.github.io/.",
"https://arxiv.org/abs/2501.09446",
"https://doublevisualdefense.github.io/"
)
add_paper("CLIPS: An Enhanced CLIP Framework for Learning with Synthetic Captions",
"Yanqing Liu, Xianhang Li, Zeyu Wang, Bingchen Zhao, Cihang Xie",
null,
"https://arxiv.org/abs/2411.16828",
"@article{liu2024clips,<br>" +
" title = {CLIPS: An Enhanced CLIP Framework for Learning with Synthetic Captions},<br>" +
" author = {Yanqing Liu, Xianhang Li, Zeyu Wang, Bingchen Zhao, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2411.16828},<br>" +
" year = {2024},<br>",
"Previous works show that noisy, web-crawled image-text pairs may limit vision-language pretraining like CLIP and propose learning with synthetic captions as a promising alternative. Our work continues this effort, introducing two simple yet effective designs to better leverage richly described synthetic captions. Firstly, by observing a strong inverse effect in learning with synthetic captions -- the short synthetic captions can generally lead to MUCH higher performance than full-length ones -- we therefore fed only partial synthetic captions to the text encoder. Secondly, we incorporate an autoregressive captioner to mimic the recaptioning process -- by conditioning on the paired image input and web-crawled text description, the captioner learns to predict the full-length synthetic caption generated by advanced MLLMs. Experiments show that our framework significantly improves zero-shot performance in cross-modal retrieval tasks, setting new SOTA results on MSCOCO and Flickr30K. Moreover, such trained vision encoders can enhance the visual capability of LLaVA, showing strong improvements on a range of MLLM benchmarks. Our project page is https://ucsc-vlaa.github.io/CLIPS/.",
"https://arxiv.org/abs/2411.16828",
"https://ucsc-vlaa.github.io/CLIPS/"
)
add_paper("M-VAR: Decoupled Scale-wise Autoregressive Modeling for High-Quality Image Generation",
"Sucheng Ren, Yaodong Yu, Nataniel Ruiz, Feng Wang, Alan Yuille, Cihang Xie",
null,
"https://arxiv.org/abs/2411.10433",
"@article{ren2024mvar,<br>" +
" title = {M-VAR: Decoupled Scale-wise Autoregressive Modeling for High-Quality Image Generation},<br>" +
" author = {Sucheng Ren, Yaodong Yu, Nataniel Ruiz, Feng Wang, Alan Yuille, Cihang Xie},<br>" +
" journal = {arXiv preprint arXiv:2411.10433},<br>" +
" year = {2024},<br>",
"There exists recent work in computer vision, named VAR, that proposes a new autoregressive paradigm for image generation. Diverging from the vanilla next-token prediction, VAR structurally reformulates the image generation into a coarse to fine next-scale prediction. In this paper, we show that this scale-wise autoregressive framework can be effectively decoupled into intra-scale modeling, which captures local spatial dependencies within each scale, and inter-scale modeling, which models cross-scale relationships progressively from coarse-to-fine scales. This decoupling structure allows to rebuild VAR in a more computationally efficient manner. Specifically, for intra-scale modeling -- crucial for generating high-fidelity images -- we retain the original bidirectional self-attention design to ensure comprehensive modeling; for inter-scale modeling, which semantically connects different scales but is computationally intensive, we apply linear-complexity mechanisms like Mamba to substantially reduce computational overhead. We term this new framework M-VAR. Extensive experiments demonstrate that our method outperforms existing models in both image quality and generation speed. For example, our 1.5B model, with fewer parameters and faster inference speed, outperforms the largest VAR-d30-2B. Moreover, our largest model M-VAR-d32 impressively registers 1.78 FID on ImageNet 256x256 and outperforms the prior-art autoregressive models LlamaGen/VAR by 0.4/0.19 and popular diffusion models LDM/DiT by 1.82/0.49, respectively. Code is available at https://github.com/OliverRensu/MVAR.",
"https://arxiv.org/abs/2411.10433",
"https://github.com/OliverRensu/MVAR"
)
add_paper("Story-Adapter: A Training-free Iterative Framework for Long Story Visualization",
"Jiawei Mao, Xiaoke Huang, Yunfei Xie, Yuanqi Chang, Mude Hui, Bingjie Xu, Yuyin Zhou",
null,
"https://arxiv.org/abs/2410.06244",
"@article{mao2024storyadapter,<br>" +
" title = {Story-Adapter: A Training-free Iterative Framework for Long Story Visualization},<br>" +
" author = {Jiawei Mao, Xiaoke Huang, Yunfei Xie, Yuanqi Chang, Mude Hui, Bingjie Xu, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2410.06244},<br>" +
" year = {2024},<br>",
"Story visualization, the task of generating coherent images based on a narrative, has seen significant advancements with the emergence of text-to-image models, particularly diffusion models. However, maintaining semantic consistency, generating high-quality fine-grained interactions, and ensuring computational feasibility remain challenging, especially in long story visualization (i.e., up to 100 frames). In this work, we propose a training-free and computationally efficient framework, termed Story-Adapter, to enhance the generative capability of long stories. Specifically, we propose an iterative paradigm to refine each generated image, leveraging both the text prompt and all generated images from the previous iteration. Central to our framework is a training-free global reference cross-attention module, which aggregates all generated images from the previous iteration to preserve semantic consistency across the entire story, while minimizing computational costs with global embeddings. This iterative process progressively optimizes image generation by repeatedly incorporating text constraints, resulting in more precise and fine-grained interactions. Extensive experiments validate the superiority of Story-Adapter in improving both semantic consistency and generative capability for fine-grained interactions, particularly in long story scenarios. The project page and associated code can be accessed via https://jwmao1.github.io/storyadapter.",
"https://arxiv.org/abs/2410.06244",
"https://jwmao1.github.io/storyadapter"
)
// add_paper("Efficient MedSAMs: Segment Anything in Medical Images on Laptop",
// "Jun Ma, Feifei Li, Sumin Kim, Reza Asakereh, Bao-Hiep Le, Dang-Khoa Nguyen-Vu, Alexander Pfefferle, Muxin Wei, Ruochen Gao, Donghang Lyu, Songxiao Yang, Lennart Purucker, Zdravko Marinov, Marius Staring, Haisheng Lu, Thuy Thanh Dao, Xincheng Ye, Zhi Li, Gianluca Brugnara, Philipp Vollmuth, Martha Foltyn-Dumitru, Jaeyoung Cho, Mustafa Ahmed Mahmutoglu, Martin Bendszus, Irada Pflüger, Aditya Rastogi, Dong Ni, Xin Yang, Guang-Quan Zhou, Kaini Wang, Nicholas Heller, Nikolaos Papanikolopoulos, Christopher Weight, Yubing Tong, Jayaram K Udupa, Cahill J Patrick, Yaqi Wang, Yifan Zhang, Francisco Contijoch, Elliot McVeigh, Xin Ye, Shucheng He, Robert Haase, Thomas Pinetz, Alexander Radbruch, Inga Krause, Erich Kobler, Jian He, Yucheng Tang, Haichun Yang, Yuankai Huo, Gongning Luo, Kaisar Kushibar, Jandos Amankulov, Dias Toleshbayev, Amangeldi Mukhamejan, Jan Egger, Antonio Pepe, Christina Gsaxner, Gijs Luijten, Shohei Fujita, Tomohiro Kikuchi, Benedikt Wiestler, Jan S Kirschke, Ezequiel de la Rosa, Federico Bolelli, Luca Lumetti, Costantino Grana, Kunpeng Xie, Guomin Wu, Behrus Puladi, Carlos Martín-Isla, Karim Lekadir, Victor M Campello, Wei Shao, Wayne Brisbane, Hongxu Jiang, Hao Wei, Wu Yuan, Shuangle Li, Yuyin Zhou, Bo Wang",
// null,
// "https://arxiv.org/abs/2412.16085",
// "@article{ma2024efficient,<br>" +
// " title = {Efficient MedSAMs: Segment Anything in Medical Images on Laptop},<br>" +
// " author = {Jun Ma, Feifei Li, Sumin Kim, Reza Asakereh, Bao-Hiep Le, Dang-Khoa Nguyen-Vu, Alexander Pfefferle, Muxin Wei, Ruochen Gao, Donghang Lyu, Songxiao Yang, Lennart Purucker, Zdravko Marinov, Marius Staring, Haisheng Lu, Thuy Thanh Dao, Xincheng Ye, Zhi Li, Gianluca Brugnara, Philipp Vollmuth, Martha Foltyn-Dumitru, Jaeyoung Cho, Mustafa Ahmed Mahmutoglu, Martin Bendszus, Irada Pflüger, Aditya Rastogi, Dong Ni, Xin Yang, Guang-Quan Zhou, Kaini Wang, Nicholas Heller, Nikolaos Papanikolopoulos, Christopher Weight, Yubing Tong, Jayaram K Udupa, Cahill J Patrick, Yaqi Wang, Yifan Zhang, Francisco Contijoch, Elliot McVeigh, Xin Ye, Shucheng He, Robert Haase, Thomas Pinetz, Alexander Radbruch, Inga Krause, Erich Kobler, Jian He, Yucheng Tang, Haichun Yang, Yuankai Huo, Gongning Luo, Kaisar Kushibar, Jandos Amankulov, Dias Toleshbayev, Amangeldi Mukhamejan, Jan Egger, Antonio Pepe, Christina Gsaxner, Gijs Luijten, Shohei Fujita, Tomohiro Kikuchi, Benedikt Wiestler, Jan S Kirschke, Ezequiel de la Rosa, Federico Bolelli, Luca Lumetti, Costantino Grana, Kunpeng Xie, Guomin Wu, Behrus Puladi, Carlos Martín-Isla, Karim Lekadir, Victor M Campello, Wei Shao, Wayne Brisbane, Hongxu Jiang, Hao Wei, Wu Yuan, Shuangle Li, Yuyin Zhou, Bo Wang},<br>" +
// " journal = {arXiv preprint arXiv:2412.16085},<br>" +
// " year = {2024},<br>",
// "Promptable segmentation foundation models have emerged as a transformative approach to addressing the diverse needs in medical images, but most existing models require expensive computing, posing a big barrier to their adoption in clinical practice. In this work, we organized the first international competition dedicated to promptable medical image segmentation, featuring a large-scale dataset spanning nine common imaging modalities from over 20 different institutions. The top teams developed lightweight segmentation foundation models and implemented an efficient inference pipeline that substantially reduced computational requirements while maintaining state-of-the-art segmentation accuracy. Moreover, the post-challenge phase advanced the algorithms through the design of performance booster and reproducibility tasks, resulting in improved algorithms and validated reproducibility of the winning solution. Furthermore, the best-performing algorithms have been incorporated into the open-source software with a user-friendly interface to facilitate clinical adoption. The data and code are publicly available to foster the further development of medical image segmentation foundation models and pave the way for impactful real-world applications.",
// "https://arxiv.org/abs/2412.16085",
// )
add_paper("A Preliminary Study of o1 in Medicine: Are We Closer to an AI Doctor?",
"Yunfei Xie, Juncheng Wu, Haoqin Tu, Siwei Yang, Bingchen Zhao, Yongshuo Zong, Qiao Jin, Cihang Xie, Yuyin Zhou",
null,
"https://arxiv.org/abs/2409.15277",
"@article{xie2024preliminarystudyo1medicine,<br>" +
" title = {A Preliminary Study of o1 in Medicine: Are We Closer to an AI Doctor?},<br>" +
" author = {Yunfei Xie, Juncheng Wu, Haoqin Tu, Siwei Yang, Bingchen Zhao, Yongshuo Zong, Qiao Jin, Cihang Xie, Yuyin Zhou},<br>" +
" journal = {arXiv preprint arXiv:2409.15277},<br>" +
" year = {2024},<br>",
"Large language models (LLMs) have exhibited remarkable capabilities across various domains and tasks, pushing the boundaries of our knowledge in learning and cognition. The latest model, OpenAI's o1, stands out as the first LLM with an internalized chain-of-thought technique using reinforcement learning strategies. While it has demonstrated surprisingly strong capabilities on various general language tasks, its performance in specialized fields such as medicine remains unknown. To this end, this report provides a comprehensive exploration of o1 on different medical scenarios, examining 3 key aspects: understanding, reasoning, and multilinguality. Specifically, our evaluation encompasses 6 tasks using data from 37 medical datasets, including two newly constructed and more challenging question-answering (QA) tasks based on professional medical quizzes from the New England Journal of Medicine (NEJM) and The Lancet. These datasets offer greater clinical relevance compared to standard medical QA benchmarks such as MedQA, translating more effectively into real-world clinical utility. Our analysis of o1 suggests that the enhanced reasoning ability of LLMs may (significantly) benefit their capability to understand various medical instructions and reason through complex clinical scenarios. Notably, o1 surpasses the previous GPT-4 in accuracy by an average of 6.2% and 6.6% across 19 datasets and two newly created complex QA scenarios. But meanwhile, we identify several weaknesses in both the model capability and the existing evaluation protocols, including hallucination, inconsistent multilingual ability, and discrepant metrics for evaluation. We release our raw data and model outputs https://ucsc-vlaa.github.io/o1_medicine/ for future research.",
"https://arxiv.org/abs/2409.15277",
"https://ucsc-vlaa.github.io/o1_medicine/"
)
add_paper("Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt Guidance",
"Jiawei Mao, Juncheng Wu, Yuyin Zhou, Xuesong Yin, Yuanqi Chang",
null,
"https://arxiv.org/abs/2406.12587",
"@article{mao2024restorer,<br>" +
" title = {Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt Guidance},<br>" +
" author = {Mao, Jiawei and Wu, Juncheng and Zhou, Yuyin and Yin, Xuesong and Chang, Yuanqi},<br>" +
" journal = {arXiv preprint arXiv:2406.12587},<br>" +
" year = {2024}<br>}",
"There are many excellent solutions in image restoration. However, most methods require on training separate models to restore images with different types of degradation. Although existing all-in-one models effectively address multiple types of degradation simultaneously, their performance in real-world scenarios is still constrained by the task confusion problem. In this work, we attempt to address this issue by introducing Restorer, a novel Transformer-based allin-one image restoration model. To effectively address the complex degradation present in real-world images, we propose All-Axis Attention (AAA), a novel attention mechanism that simultaneously models long-range dependencies across both spatial and channel dimensions, capturing potential correlations along all axes. Additionally, we introduce textual prompts in Restorer to incorporate explicit task priors, enabling the removal of specific degradation types based on user instructions. By iterating over these prompts, Restorer can handle composite degradation in real-world scenarios without requiring additional training. Based on these designs, Restorer with one set of parameters demonstrates state-of-theart performance in multiple image restoration tasks compared to existing all-in-one and even single-task models. Additionally, Restorer is efficient during inference, suggesting the potential in real-world applications. Code will be available at https://github.com/Talented-Q/Restorer.",
"https://arxiv.org/abs/2406.12587",
"https://github.com/Talented-Q/Restorer."
)
add_paper("VideoLLaMB: Long-context Video Understanding with Recurrent Memory Bridges",
"Yuxuan Wang, Cihang Xie, Yang Liu, Zilong Zheng",
null,
"https://arxiv.org/abs/2409.01071",
"@article{wang2024videollamb,<br>" +
" title = {VideoLLaMB: Long-context Video Understanding with Recurrent Memory Bridges},<br>" +
" author = {Yuxuan Wang, Cihang Xie, Yang Liu, Zilong Zheng},<br>" +
" journal = {arXiv preprint arXiv:2409.01071},<br>" +
" year = {2024},<br>",
"Recent advancements in large-scale video-language models have shown significant potential for real-time planning and detailed interactions. However, their high computational demands and the scarcity of annotated datasets limit their practicality for academic researchers. In this work, we introduce VideoLLaMB, a novel framework that utilizes temporal memory tokens within bridge layers to allow for the encoding of entire video sequences alongside historical visual data, effectively preserving semantic continuity and enhancing model performance across various tasks. This approach includes recurrent memory tokens and a SceneTilling algorithm, which segments videos into independent semantic units to preserve semantic integrity. Empirically, VideoLLaMB significantly outstrips existing video-language models, demonstrating a 5.5 points improvement over its competitors across three VideoQA benchmarks, and 2.06 points on egocentric planning. Comprehensive results on the MVBench show that VideoLLaMB-7B achieves markedly better results than previous 7B models of same LLM. Remarkably, it maintains robust performance as PLLaVA even as video length increases up to 8 times. Besides, the frame retrieval results on our specialized Needle in a Video Haystack (NIAVH) benchmark, further validate VideoLLaMB's prowess in accurately identifying specific frames within lengthy videos. Our SceneTilling algorithm also enables the generation of streaming video captions directly, without necessitating additional training. In terms of efficiency, VideoLLaMB, trained on 16 frames, supports up to 320 frames on a single Nvidia A100 GPU with linear GPU memory scaling, ensuring both high performance and cost-effectiveness, thereby setting a new foundation for long-form video-language models in both academic and practical applications.",
"https://arxiv.org/abs/2409.01071",
"https://videollamb.github.io/"
)
add_paper("What If We Recaption Billions of Web Images with LLaMA-3",
"Xianhang Li, Haoqin Tu, Mude Hui, Zeyu Wang, Bingchen Zhao, Junfei Xiao, Sucheng Ren, Jieru Mei, Qing Liu, Huangjie Zheng, Yuyin Zhou, Cihang Xie",
null,
"https://arxiv.org/abs/2406.08478",
"@article{li2024recaption,<br>" +
" title = {What If We Recaption Billions of Web Images with LLaMA-3},<br>" +
" author = {Li, Xianhang and Tu, Haoqin and Hui, Mude and Wang, Zeyu and Zhao, Bingchen and Xiao, Junfei and Ren, Sucheng and Mei, Jieru and Liu, Qing and Zheng, Huangjie and Zhou, Yuyin and Xie, Cihang},<br>" +
" journal = {arXiv preprint arXiv:2406.08478},<br>" +
" year = {2024},<br>",
"Web-crawled image-text pairs are inherently noisy. Prior studies demonstrate that semantically aligning and enriching textual descriptions of these pairs can significantly enhance model training across various vision-language tasks, particularly text-to-image generation. However, large-scale investigations in this area remain predominantly closed-source. Our paper aims to bridge this community effort, leveraging the powerful and \textit{open-sourced} LLaMA-3, a GPT-4 level LLM. Our recaptioning pipeline is simple: first, we fine-tune a LLaMA-3-8B powered LLaVA-1.5 and then employ it to recaption 1.3 billion images from the DataComp-1B dataset. Our empirical results confirm that this enhanced dataset, Recap-DataComp-1B, offers substantial benefits in training advanced vision-language models. For discriminative models like CLIP, we observe enhanced zero-shot performance in cross-modal retrieval tasks. For generative models like text-to-image Diffusion Transformers, the generated images exhibit a significant improvement in alignment with users' text instructions, especially in following complex queries. Our project page is https://www.haqtu.me/Recap-Datacomp-1B/",
"https://arxiv.org/abs/2406.08478",
"https://www.haqtu.me/Recap-Datacomp-1B/"
)
add_paper("Medical Vision Generalist: Unifying Medical Imaging Tasks in Context",
"Sucheng Ren, Xiaoke Huang, Xianhang Li, Junfei Xiao, Jieru Mei, Zeyu Wang, Alan Yuille, Yuyin Zhou",
null,
"https://arxiv.org/abs/2406.05565",
"@article{ren2024medicalvision,<br>" +
" title = {Medical Vision Generalist: Unifying Medical Imaging Tasks in Context},<br>" +
" author = {Ren, Sucheng and Huang, Xiaoke and Li, Xianhang and Xiao, Junfei and Mei, Jieru and Wang, Zeyu and Yuille, Alan and Zhou, Yuyin},<br>" +
" journal = {arXiv preprint arXiv:2406.05565},<br>" +
" year = {2024},<br>",
"This study presents Medical Vision Generalist (MVG), the first foundation model capable of handling various medical imaging tasks -- such as cross-modal synthesis, image segmentation, denoising, and inpainting -- within a unified image-to-image generation framework. Specifically, MVG employs an in-context generation strategy that standardizes the handling of inputs and outputs as images. By treating these tasks as an image generation process conditioned on prompt image-label pairs and input images, this approach enables a flexible unification of various tasks, even those spanning different modalities and datasets. To capitalize on both local and global context, we design a hybrid method combining masked image modeling with autoregressive training for conditional image generation. This hybrid approach yields the most robust performance across all involved medical imaging tasks. To rigorously evaluate MVG's capabilities, we curated the first comprehensive generalist medical vision benchmark, comprising 13 datasets and spanning four imaging modalities (CT, MRI, X-ray, and micro-ultrasound). Our results consistently establish MVG's superior performance, outperforming existing vision generalists, such as Painter and LVM. Furthermore, MVG exhibits strong scalability, with its performance demonstrably improving when trained on a more diverse set of tasks, and can be effectively adapted to unseen datasets with only minimal task-specific samples. The code is available at https://github.com/OliverRensu/MVG.",
"https://arxiv.org/abs/2406.05565",
"https://github.com/OliverRensu/MVG"
)
add_paper("Fast-DDPM: Fast Denoising Diffusion Probabilistic Models for Medical Image-to-Image Generation",
"Hongxu Jiang, Muhammad Imran, Linhai Ma, Teng Zhang, Yuyin Zhou, Muxuan Liang, Kuang Gong, Wei Shao",
null,
"https://arxiv.org/abs/2405.14802",
"@article{jiang2024fastddpm,<br>" +
" title = {Fast Denoising Diffusion Probabilistic Models for Medical Image-to-Image Generation},<br>" +
" author = {Jiang, Hongxu and Imran, Muhammad and Ma, Linhai and Zhang, Teng and Zhou, Yuyin and Liang, Muxuan and Gong, Kuang and Shao, Wei},<br>" +
" journal = {arXiv preprint arXiv:2405.14802},<br>" +
" year = {2024},<br>",
"Denoising diffusion probabilistic models (DDPMs) have achieved unprecedented success in computer vision. However, they remain underutilized in medical imaging, a field crucial for disease diagnosis and treatment planning. This is primarily due to the high computational cost associated with (1) the use of large number of time steps (e.g., 1,000) in diffusion processes and (2) the increased dimensionality of medical images, which are often 3D or 4D. Training a diffusion model on medical images typically takes days to weeks, while sampling each image volume takes minutes to hours. To address this challenge, we introduce Fast-DDPM, a simple yet effective approach capable of improving training speed, sampling speed, and generation quality simultaneously. Unlike DDPM, which trains the image denoiser across 1,000 time steps, Fast-DDPM trains and samples using only 10 time steps. The key to our method lies in aligning the training and sampling procedures. We introduced two efficient noise schedulers with 10 time steps: one with uniform time step sampling and another with non-uniform sampling. We evaluated Fast-DDPM across three medical image-to-image generation tasks: multi-image super-resolution, image denoising, and image-to-image translation. Fast-DDPM outperformed DDPM and current state-of-the-art methods based on convolutional networks and generative adversarial networks in all tasks. Additionally, Fast-DDPM reduced training time by a factor of 5 and sampling time by a factor of 100 compared to DDPM. Our code is publicly available at: https://github.com/mirthAI/Fast-DDPM.",
"https://arxiv.org/abs/2405.14802",
"https://github.com/mirthAI/Fast-DDPM"
)
add_paper("VideoHallucer: Evaluating Intrinsic and Extrinsic Hallucinations in Large Video-Language Models",
"Yuxuan Wang, Yueqian Wang, Dongyan Zhao, Cihang Xie, Zilong Zheng",
null,
"https://arxiv.org/abs/2406.16338",
"@article{wang2025videohallucer,<br>" +
" title = {VideoHallucer: Evaluating Intrinsic and Extrinsic Hallucinations in Large Video-Language Models},<br>" +
" author = {Wang, Yuxuan and Wang, Yueqian and Zhao, Dongyan and Xie, Cihang and Zheng, Zilong},<br>" +
" journal = {arXiv preprint arXiv:2406.16338},<br>" +
" year = {2024}<br>",
"Recent advancements in Multimodal Large Language Models (MLLMs) have extended their capabilities to video understanding. Yet, these models are often plagued by 'hallucinations', where irrelevant or nonsensical content is generated, deviating from the actual video context. This work introduces VideoHallucer, the first comprehensive benchmark for hallucination detection in large video-language models (LVLMs). VideoHallucer categorizes hallucinations into two main types: intrinsic and extrinsic, offering further subcategories for detailed analysis, including object-relation, temporal, semantic detail, extrinsic factual, and extrinsic non-factual hallucinations. We adopt an adversarial binary VideoQA method for comprehensive evaluation, where pairs of basic and hallucinated questions are crafted strategically. By evaluating eleven LVLMs on VideoHallucer, we reveal that i) the majority of current models exhibit significant issues with hallucinations; ii) while scaling datasets and parameters improves models' ability to detect basic visual cues and counterfactuals, it provides limited benefit for detecting extrinsic factual hallucinations; iii) existing models are more adept at detecting facts than identifying hallucinations. As a byproduct, these analyses further instruct the development of our self-PEP framework, achieving an average of 5.38% improvement in hallucination resistance across all model architectures.",
"https://arxiv.org/abs/2406.16338",
"https://videohallucer.github.io/"
)
add_paper("RetinaRegNet: A Versatile Approach for Retinal Image Registration",
"Vishal Balaji Sivaraman, Muhammad Imran, Qingyue Wei, Preethika Muralidharan, Michelle R Tamplin, Isabella M Grumbach, Randy H Kardon, Jui-Kai Wang, Yuyin Zhou, Wei Shao",
null,
"https://arxiv.org/abs/2404.16017",
"@article{sivaraman2024retinaregnet,<br>" +
" title = {RetinaRegNet: A Versatile Approach for Retinal Image Registration},<br>" +
" author = {Sivaraman, Vishal Balaji and Imran, Muhammad and Wei, Qingyue and Muralidharan, Preethika and Tamplin, Michelle R and Grumbach, Isabella M and Kardon, Randy H and Wang, Jui-Kai and Zhou, Yuyin and Shao, Wei},<br>" +
" journal = {arXiv preprint arXiv:2404.16017},<br>" +
" year = {2024},<br>",
"We introduce the RetinaRegNet model, which can achieve state-of-the-art performance across various retinal image registration tasks. RetinaRegNet does not require training on any retinal images. It begins by establishing point correspondences between two retinal images using image features derived from diffusion models. This process involves the selection of feature points from the moving image using the SIFT algorithm alongside random point sampling. For each selected feature point, a 2D correlation map is computed by assessing the similarity between the feature vector at that point and the feature vectors of all pixels in the fixed image. The pixel with the highest similarity score in the correlation map corresponds to the feature point in the moving image. To remove outliers in the estimated point correspondences, we first applied an inverse consistency constraint, followed by a transformation-based outlier detector. This method proved to outperform the widely used random sample consensus (RANSAC) outlier detector by a significant margin. To handle large deformations, we utilized a two-stage image registration framework. A homography transformation was used in the first stage and a more accurate third-order polynomial transformation was used in the second stage. The model's effectiveness was demonstrated across three retinal image datasets: color fundus images, fluorescein angiography images, and laser speckle flowgraphy images. RetinaRegNet outperformed current state-of-the-art methods in all three datasets. It was especially effective for registering image pairs with large displacement and scaling deformations. This innovation holds promise for various applications in retinal image analysis. Our code is publicly available at https://github.com/mirthAI/RetinaRegNet.",
"https://arxiv.org/abs/2404.16017",
"https://github.com/mirthAI/RetinaRegNet"
)
add_paper("Audio-Visual LLM for Video Understanding",
"Hangxun Shu, Lei Zhang, Hao Jiang, Cihang Xie",
null,
"https://arxiv.org/abs/2312.06720",
"@article{shu2023audio,<br>" +
" title = {Audio-Visual LLM for Video Understanding},<br>" +
" author = {Shu, Fangxun and Zhang, Lei and Jiang, Hao and Xie, Cihang},<br>" +
" journal = {arXiv preprint arXiv:2312.06720},<br>" +
" year = {2023}<br>}",
"This paper presents Audio-Visual LLM, a Multimodal Large Language Model that takes both visual and auditory inputs for holistic video understanding. A key design is the modality-augmented training, which involves the integration of modality-specific tokens engineered to activate the appropriate visual and/or auditory encoder selectively. This mechanism is pivotal in enabling end-to-end joint training with video data at different modalities, including visual-only, audio-only, and audio-visual formats. Moreover, we introduce a high-quality video instruction dataset, derived from GPT-4. This dataset allows Audio-Visual LLM to adeptly process a variety of task-oriented video instructions, ranging from multi-turn conversations and audio-visual narratives to complex reasoning tasks. Extensive experiments demonstrate that Audio-Visual LLM impressively achieves strong zero-shot results across a range of video understanding tasks. For example, Audio-Visual LLM achieves an accuracy of 53.7% on MSRVTT-QA, outperforming non-LLM-based InterVideo by 6.6% and LLM-based Valley by 4.4%, respectively. Additionally, our Audio-Visual LLM also achieves competitive performance on audio tasks (e.g., AudioCaps).",
"https://arxiv.org/abs/2312.06720"
)
add_paper("Compress & Align: Curating Image-Text Data with Human Knowledge",
"Lei Zhang, Fangxun Shu, Sucheng Ren, Hao Jiang, Bingchen Zhao, Cihang Xie",
null,
"https://arxiv.org/abs/2312.06726",
"@article{zhang2023compress,<br>" +
" title = {Compress & Align: Curating Image-Text Data with Human Knowledge},<br>" +
" author = {Zhang, Lei and Shu, Fangxun and Ren, Sucheng and Zhao, Bingchen and Jiang, Hao and Xie, Cihang},<br>" +
" journal = {arXiv preprint arXiv:2312.06726},<br>" +
" year = {2023}<br>}",
"The massive growth of image-text data through web crawling inherently presents the challenge of variability in data quality. This paper introduces a novel algorithm, rooted in human knowledge, to compress this vast corpus of web-crawled image-text datasets to a compact and high-quality form. Our method unfolds in three major steps. First, we collect an image-text dataset, wherein each image is associated with multiple captions sourced from diverse origins. Then, to systemically capture human preferences regarding the best caption paired with each image, we establish a comprehensive set of both subjective and objective criteria for critically guiding the alignment assessment from labelers. Lastly, we train a reward model on the annotated dataset to internalize the nuanced human understanding of image-text alignment. The resulting reward model thus can act as a human-like referee to filter misaligned/low-quality image-text pairs. Extensive experiments demonstrate that we are able to secure (or even improve) model performance by compressing the image-text datasets up to ~90%. An impressive example is that, by aggressively reducing the total training sample from 130M to 15.5M (e.g., ~9x smaller), our BLIP-B/16 models still consistently show superior performance compared with the full-size-dataset counterpart on image-text retrieval (Flickr30K, COCO) by ~2.5% in Recall@1, and on image-captioning (Nocaps, COCO) by ~10.0% in CIDEr and ~2.7% in SPICE.",
"https://arxiv.org/abs/2312.06726"
)
// add_paper("3D TransUNet: Advancing Medical Image Segmentation through Vision Transformers",
// "Jieneng Chen, Jieru Mei, Xianhang Li, Yongyi Lu, Qihang Yu, Qingyue Wei, Xiangde Luo, Yutong Xie, Ehsan Adeli, Yan Wang, Matthew Lungren, Lei Xing, Le Lu, Alan Yuille, Yuyin Zhou",
// null,
// "https://arxiv.org/abs/2310.07781",
// "@article{chen2023transunet,<br>" +
// " title = {3D TransUNet: Advancing Medical Image Segmentation through Vision Transformers},<br>" +
// " author = {Chen, Jieneng and Mei, Jieru and Li, Xianhang and Lu, Yongyi and Yu, Qihang and Wei, Qingyue and Luo, Xiangde and Xie, Yutong and Adeli, Ehsan and Wang, Yan and Lungren, Matthew and Xing, Lei and Lu, Le and Yuille, Alan and Zhou, Yuyin},<br>" +
// " journal = {arXiv preprint arXiv:2310.07781},<br>" +
// " year = {2023}<br>}",
// "Medical image segmentation plays a crucial role in advancing healthcare systems for disease diagnosis and treatment planning. The u-shaped architecture, popularly known as U-Net, has proven highly successful for various medical image segmentation tasks. However, U-Net's convolution-based operations inherently limit its ability to model long-range dependencies effectively. To address these limitations, researchers have turned to Transformers, renowned for their global self-attention mechanisms, as alternative architectures. Our previous TransUNet, which leverages Transformers' self-attention to complement U-Net's localized information with the global context, is now extended to a 3D network. This is achieved by building upon the state-of-the-art nnU-Net architecture, fully exploring Transformers' potential in both the encoder and decoder design. We introduce a Transformer encoder for tokenizing image patches and a Transformer decoder for adaptively refining candidate regions. The Transformer encoder excels in multi-organ segmentation, while the Transformer decoder is more beneficial for small and challenging segmented targets such as tumor segmentation. Our extensive experiments showcase the significant potential of integrating a Transformer-based encoder and decoder into the u-shaped medical image segmentation architecture, with TransUNet outperforming competitors in various medical applications.",
// "https://arxiv.org/abs/2310.07781",
// "https://github.com/Beckschen/3D-TransUNet"
// )
// add_paper("BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks",
// "Kai Zhang, Jun Yu, Zhiling Yan, Yixin Liu, Eashan Adhikarla, Sunyang Fu, Xun Chen, Chen Chen, Yuyin Zhou, Xiang Li, Lifang He, Brian D Davison, Quanzheng Li, Yong Chen, Hongfang Liu, Lichao Sun",
// null,
// "https://arxiv.org/abs/2305.17100",
// "@article{zhang2023biomedgpt,<br>" +
// " title = {BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks},<br>" +
// " author = {Zhang, Kai and Yu, Jun and Yan, Zhiling and Liu, Yixin and Adhikarla, Eashan and Fu, Sunyang and Chen, Xun and Chen, Chen and Zhou, Yuyin and Li, Xiang and He, Lifang and Davison, Brian D and Li, Quanzheng and Chen, Yong and Liu, Hongfang and Sun, Lichao},<br>" +
// " journal = {arXiv preprint arXiv:2305.17100}<br>" +
// " year = {2023},<br>",
// "In this paper, we introduce a unified and generalist Biomedical Generative Pre-trained Transformer (BiomedGPT) model, which leverages self-supervision on large and diverse datasets to accept multi-modal inputs and perform a range of downstream tasks. Our experiments demonstrate that BiomedGPT delivers expansive and inclusive representations of biomedical data, outperforming the majority of preceding state-of-the-art models across five distinct tasks with 20 public datasets spanning over 15 unique biomedical modalities. Through the ablation study, we also showcase the efficacy of our multi-modal and multi-task pretraining approach in transferring knowledge to previously unseen data. Overall, our work presents a significant step forward in developing unified and generalist models for biomedicine, with far-reaching implications for improving healthcare outcomes.",
// "https://arxiv.org/abs/2305.17100",
// )
add_paper("Distribution Aligned Diffusion and Prototype-guided network for Unsupervised Domain Adaptive Segmentation",
"Haipeng Zhou, Lei Zhu, Yuyin Zhou",
// "arxiv, 2023",
null,
"https://arxiv.org/abs/2303.12313",
"@article{zhou2023distribution,<br>" +
" title={Distribution Aligned Diffusion and Prototype-guided network for Unsupervised Domain Adaptive Segmentation},<br>" +
" author={Zhou, Haipeng and Zhu, Lei and Zhou, Yuyin},<br>" +
" journal = {arXiv preprint arXiv:2303.12313},<br>" +
" year={2023}<br>}",
"The Diffusion Probabilistic Model (DPM) has emerged as a highly effective generative model in the field of computer vision. Its intermediate latent vectors offer rich semantic information, making it an attractive option for various downstream tasks such as segmentation and detection. In order to explore its potential further, we have taken a step forward and considered a more complex scenario in the medical image domain, specifically, under an unsupervised adaptation condition. To this end, we propose a Diffusion-based and Prototype-guided network (DP-Net) for unsupervised domain adaptive segmentation. Concretely, our DP-Net consists of two stages: 1) Distribution Aligned Diffusion (DADiff), which involves training a domain discriminator to minimize the difference between the intermediate features generated by the DPM, thereby aligning the inter-domain distribution; and 2) Prototype-guided Consistency Learning (PCL), which utilizes feature centroids as prototypes and applies a prototype-guided loss to ensure that the segmentor learns consistent content from both source and target domains. Our approach is evaluated on fundus datasets through a series of experiments, which demonstrate that the performance of the proposed method is reliable and outperforms state-of-the-art methods. Our work presents a promising direction for using DPM in complex medical image scenarios, opening up new possibilities for further research in medical imaging.",
"https://arxiv.org/abs/2303.12313"
)
add_paper("Bag of Tricks for FGSM Adversarial Training",
"Zichao Li, Li Liu, Zeyu Wang, Yuyin Zhou, Cihang Xie",
null,
"https://arxiv.org/abs/2209.02684",
"@article{li2022bag,<br>" +
" title = {Bag of Tricks for FGSM Adversarial Training},<br>" +
" author = {Li, Zichao and Liu, Li and Wang, Zeyu and Zhou, Yuyin and Xie, Cihang},<br>" +
" journal = {arXiv preprint arXiv:2209.02684},<br>" +
" year = {2022}<br>}",
"Adversarial training (AT) with samples generated by Fast Gradient Sign Method (FGSM), also known as FGSM-AT, is a computationally simple method to train robust networks. However, during its training procedure, an unstable mode of “catastrophic overfitting” has been identified in [Wong et al., 2020], where the robust accuracy abruptly drops to zero within a single training step. Existing methods use gradient regularizers or random initialization tricks to attenuate this issue, whereas they either take high computational cost or lead to lower robust accuracy. In this work, we provide the first study, which thoroughly examines a collection of tricks from three perspectives: Data Initialization, Network Structure, and Optimization, to overcome the catastrophic overfitting in FGSM-AT. Surprisingly, we find that simple tricks, i.e., a) masking partial pixels (even without randomness), b) setting a large convolution stride and smooth activation functions, or c) regularizing the weights of the first convolutional layer, can effectively tackle the overfitting issue. Extensive results on a range of network architectures validate the effectiveness of each proposed trick, and the combinations of tricks are also investigated. For example, trained with PreActResNet-18 on CIFAR-10, our method attains 49.8% accuracy against PGD-50 attacker and 46.4% accuracy against AutoAttack, demonstrating that pure FGSM-AT is capable of enabling robust learners. The code and models are publicly available at https://github. com/UCSC-VLAA/Bag-of-Tricks-for-FGSM-AT.",
"https://arxiv.org/abs/2209.02684",
"https://github. com/UCSC-VLAA/Bag-of-Tricks-for-FGSM-AT"
)
add_paper("The FELIX Project: Deep Networks To Detect Pancreatic Neoplasms",
"Yingda Xia, Qihang Yu, Linda Chu, Satomi Kawamoto, Seyoun Park, Fengze Liu, Jieneng Chen, Zhuotun Zhu, Bowen Li, Zongwei Zhou, Yongyi Lu, Yan Wang, Wei Shen, Lingxi Xie, Yuyin Zhou, Christopher Wolfgang, Ammar Javed, Daniel Fadaei Fouladi, Shahab Shayesteh, Jefferson Graves, Alejandra Blanco, Eva S Zinreich, Benedict Kinny-Köster, Kenneth Kinzler, Ralph H Hruban, Bert Vogelstein, Alan Yuille, Elliot K Fishman",
null,
"https://www.medrxiv.org/content/10.1101/2022.09.24.22280071v1",
"@article{xia2022felix,<br>" +
" title = {The FELIX Project: Deep Networks To Detect Pancreatic Neoplasms},<br>" +
" author = {Xia, Yingda and Yu, Qihang and Chu, Linda and Kawamoto, Satomi and Park, Seyoun and Liu, Fengze and Chen, Jieneng and Zhu, Zhuotun and Li, Bowen and Zhou, Zongwei and others},<br>" +
" journal = {medRxiv},<br>" +
" year = {2022},<br>",
"Tens of millions of abdominal images are performed with computed tomography (CT) in the U.S. each year but pancreatic cancers are sometimes not initially detected in these images. We here describe a suite of algorithms (named FELIX) that can recognize pancreatic lesions from CT images without human input. Using FELIX, >90% of patients with pancreatic ductal adenocarcinomas were detected at a specificity of >90% in patients without pancreatic disease. FELIX may be able to assist radiologists in identifying pancreatic cancers earlier, when surgery and other treatments offer more hope for long-term survival.",
"https://www.medrxiv.org/content/10.1101/2022.09.24.22280071v1"
)
// 2021 preprint
add_paper("Radfusion: Benchmarking performance and fairness for multimodal pulmonary embolism detection from ct and ehr",
"Yuyin Zhou, Shih-Cheng Huang, Jason Alan Fries, Alaa Youssef, Timothy J Amrhein, Marcello Chang, Imon Banerjee, Daniel Rubin, Lei Xing, Nigam Shah, Matthew P Lungren",
null,
"https://arxiv.org/abs/2111.11665",
"@article{zhou2021radfusion,<br>" +
" title = {Radfusion: Benchmarking performance and fairness for multimodal pulmonary embolism detection from ct and ehr},<br>" +
" author = {Zhou, Yuyin and Huang, Shih-Cheng and Fries, Jason Alan and Youssef, Alaa and Amrhein, Timothy J and Chang, Marcello and Banerjee, Imon and Rubin, Daniel and Xing, Lei and Shah, Nigam and others},<br>" +
" journal = {arXiv preprint arXiv:2111.11665},<br>" +
" year = {2021}<br>}",
"Despite the routine use of electronic health record (EHR) data by radiologists to contextualize clinical history and inform image interpretation, the majority of deep learning architectures for medical imaging are unimodal, i.e., they only learn features from pixel-level information. Recent research revealing how race can be recovered from pixel data alone highlights the potential for serious biases in models which fail to account for demographics and other key patient attributes. Yet the lack of imaging datasets which capture clinical context, inclusive of demographics and longitudinal medical history, has left multimodal medical imaging underexplored. To better assess these challenges, we present RadFusion, a multimodal, benchmark dataset of 1794 patients with corresponding EHR data and high-resolution computed tomography (CT) scans labeled for pulmonary embolism. We evaluate several representative multimodal fusion models and benchmark their fairness properties across protected subgroups, e.g., gender, race/ethnicity, age. Our results suggest that integrating imaging and EHR data can improve classification performance and robustness without introducing large disparities in the true positive rate between population groups.",
"https://arxiv.org/abs/2111.11665"
)
// add_paper("Transunet: Transformers make strong encoders for medical image segmentation",
// "Jieneng Chen, Yongyi Lu, Qihang Yu, Xiangde Luo, Ehsan Adeli, Yan Wang, Le Lu, Alan Yuille, Yuyin Zhou",
// // "arXiv, 2021",
// null,
// "https://arxiv.org/abs/2102.04306",
// "@article{chen2021transunet,<br>" +
// " title = {Transunet: Transformers make strong encoders for medical image segmentation},<br>" +
// " author = {Chen, Jieneng and Lu, Yongyi and Yu, Qihang and Luo, Xiangde and Adeli, Ehsan and Wang, Yan and Lu, Le and Yuille, Alan and Zhou, Yuyin},<br>" +
// " journal = {arXiv preprint arXiv:2102.04306},<br>" +
// " year = {2021}<br>}",
// "Medical image segmentation is an essential prerequisite for developing healthcare systems, especially for disease diagnosis and treatment planning. On various medical image segmentation tasks, the u-shaped architecture, also known as U-Net, has become the de-facto standard and achieved tremendous success. However, due to the intrinsic locality of convolution operations, U-Net generally demonstrates limitations in explicitly modeling long-range dependency. Transformers, designed for sequence-to-sequence prediction, have emerged as alternative architectures with innate global self-attention mechanisms, but can result in limited localization abilities due to insufficient low-level details. In this paper, we propose TransUNet, which merits both Transformers and U-Net, as a strong alternative for medical image segmentation. On one hand, the Transformer encodes tokenized image patches from a convolution neural network (CNN) feature map as the input sequence for extracting global contexts. On the other hand, the decoder upsamples the encoded features which are then combined with the high-resolution CNN feature maps to enable precise localization. We argue that Transformers can serve as strong encoders for medical image segmentation tasks, with the combination of U-Net to enhance finer details by recovering localized spatial information. TransUNet achieves superior performances to various competing methods on different medical applications including multi-organ segmentation and cardiac segmentation. Code and models are available at https://github.com/Beckschen/TransUNet.",
// "https://arxiv.org/abs/2102.04306",
// "https://github.com/Beckschen/TransUNet"
// )
add_paper("Can temporal information help with contrastive self-supervised learning?",
"Yutong Bai, Haoqi Fan, Ishan Misra, Ganesh Venkatesh, Yongyi Lu, Yuyin Zhou, Qihang Yu, Vikas Chandra, Alan Yuille",
null,
"https://arxiv.org/abs/2011.13046",
"@article{bai2020can,<br>" +
" title = {Can temporal information help with contrastive self-supervised learning?},<br>" +
" author = {Bai, Yutong and Fan, Haoqi and Misra, Ishan and Venkatesh, Ganesh and Lu, Yongyi and Zhou, Yuyin and Yu, Qihang and Chandra, Vikas and Yuille, Alan},<br>" +
" journal = {arXiv preprint arXiv:2011.13046},<br>" +
" year = {2020}<br>" ,
"Leveraging temporal information has been regarded as essential for developing video understanding models. However, how to properly incorporate temporal information into the recent successful instance discrimination based contrastive self-supervised learning (CSL) framework remains unclear. As an intuitive solution, we find that directly applying temporal augmentations does not help, or even impair video CSL in general. This counter-intuitive observation motivates us to re-design existing video CSL frameworks, for better integration of temporal knowledge. To this end, we present Temporal-aware Contrastive self-supervised learningTaCo, as a general paradigm to enhance video CSL. Specifically, TaCo selects a set of temporal transformations not only as strong data augmentation but also to constitute extra self-supervision for video understanding. By jointly contrasting instances with enriched temporal transformations and learning these transformations as self-supervised signals, TaCo can significantly enhance unsupervised video representation learning. For instance, TaCo demonstrates consistent improvement in downstream classification tasks over a list of backbones and CSL approaches. Our best model achieves 85.1% (UCF-101) and 51.6% (HMDB-51) top-1 accuracy, which is a 3% and 2.4% relative improvement over the previous state-of-the-art.",
"https://arxiv.org/abs/2011.13046"
)
add_paper("Smooth Adversarial Training",
"Cihang Xie, Mingxing Tan, Boqing Gong, Alan Yuille, Quoc Le",
null,
"https://arxiv.org/abs/2006.14536",
"@article{xie2020smooth,<br>" +
" title = {Smooth adversarial training},<br>" +
" author = {Xie, Cihang and Tan, Mingxing and Gong, Boqing and Yuille, Alan and Le, Quoc V},<br>" +
" journal = {arXiv preprint arXiv:2006.14536},<br>" +
" year = {2020}<br>}",
"It is commonly believed that networks cannot be both accurate and robust, that gaining robustness means losing accuracy. It is also generally believed that, unless making networks larger, network architectural elements would otherwise matter little in improving adversarial robustness. Here we present evidence to challenge these common beliefs by a careful study about adversarial training. Our key observation is that the widely-used ReLU activation function significantly weakens adversarial training due to its non-smooth nature. Hence we propose smooth adversarial training (SAT), in which we replace ReLU with its smooth approximations to strengthen adversarial training. The purpose of smooth activation functions in SAT is to allow it to f ind harder adversarial examples and compute better gradient updates during adversarial training. Compared to standard adversarial training, SAT improves adversarial robustness for “free”, i.e., no drop in accuracy and no increase in computational cost. For example, without introducing additional computations, SAT significantly enhances ResNet-50’s robustness from 33.0% to 42.3%, while also improving accuracy by 0.9% on ImageNet. SAT also works well with larger networks: it helps EfficientNet-L1 to achieve 82.2% accuracy and 58.6% robustness on ImageNet, outperforming the previous state-ofthe-art defense by 9.5% for accuracy and 11.6% for robustness. Models are available at https://github.com/ cihangxie/SmoothAdversarialTraining.",
"https://arxiv.org/abs/2006.14536",
"https://github.com/cihangxie/SmoothAdversarialTraining"
)
</script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js"></script>
</details>
<!-- --------------------------------------Publications -------------------->
<script>
paper_count = paper_count
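// `paper_count` is assumed to carry over from the publication entries rendered by the
// earlier <script> block on this page, so the bib/abstract toggle ids stay unique.
/**
 * Appends one publication entry to the current list via document.write.
 * Only `title` and `authors` are used by every call on this page; each later
 * argument may be null and is then simply skipped.
 *
 * @param {string}  title       Paper title (wrapped in a link when `link` is given).
 * @param {string}  authors     Comma-separated author list.
 * @param {?string} conference  Venue label, e.g. "CVPR, 2025"; null for preprints.
 * @param {?string} link        URL attached to the title.
 * @param {?string} bib         BibTeX string (with <br> line breaks) shown by the "bib" toggle.
 * @param {?string} abstract    Abstract text shown by the "abstract" toggle.
 * @param {?string} arxiv_link  URL for the "arxiv" badge.
 * @param {?string} code        URL for the "code/models" badge.
 * @param {?string} press       URL for the "press" badge.
 * @param {?string} slides      URL for the "slides/poster" badge.
 * @param {?string} talk        URL for the "talk" badge.
 * @param {?string} msg         Optional italic note rendered under the entry.
 *
 * The bib/abstract toggles rely on a `copy(target, source)` helper that is expected
 * to be defined elsewhere on this page.
 */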
function add_paper(title, authors, conference, link, bib, abstract, arxiv_link, code, press, slides, talk, msg) {
list_entry = "<li style=\"font-size:18px\">"
if (link != null)
list_entry += "<a href=\"" + link + "\">"
list_entry += "<b>" + title + "</b>"
if (link != null)
list_entry += "</a>"
list_entry += "<br>" + authors + ".<br>"
if (conference != null)
list_entry+= "<i><font color=\" #707070\">" + conference + "</font></i>.</li>"
if (bib != null) {
list_entry += "<div id=\"bib" + paper_count + "\" style=\"display:none\">" + bib + "</div>"
list_entry += "<a href=\"javascript:copy(div" + paper_count + ",bib" + paper_count + ")\"> <span class=\"label label-success\">bib</span></a>"
}
if (abstract != null) {
list_entry += "<div id=\"abstract" + paper_count + "\" style=\"display:none\">" + abstract + "</div>"
list_entry += "<a href=\"javascript:copy(div" + paper_count + ",abstract" + paper_count + ")\"> <span class=\"label label-warning\">abstract</span></a>"
}
if (arxiv_link != null)
list_entry += " <a href=\"" + arxiv_link + "\"><span class=\"label label-primary\">arxiv</span></a>"
if (code != null)
list_entry += " <a href=\"" + code + "\"><span class=\"label label-danger\">code/models</span></a>"
if (press != null)
list_entry += " <a href=\"" + press + "\"><span class=\"label label-success\">press</span></a>"
if (slides != null)
list_entry += " <a href=\"" + slides + "\"><span class=\"label label-info\">slides/poster</span></a>"
if (talk != null)
list_entry += " <a href=\"" + talk + "\"><span class=\"label label-default\">talk</span></a>"
list_entry += "<br>"
if (msg != null)
list_entry += "<i>" + msg + "</i>"
list_entry += "<div id=\"div" + paper_count + "\" style=\"font-size:15px\"></div><br>"
document.write(list_entry)
paper_count += 1
}
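// Illustrative call only (kept commented out, like the retired entries above): the title,
// authors, and URLs below are placeholders rather than a real publication. It simply shows
// the full argument order, including the trailing press/slides/talk/msg slots that the
// live entries on this page leave unused.
// add_paper("Example Paper Title",
//     "First Author, Second Author",
//     "Venue, 2025",
//     "https://example.org/paper",
//     "@article{example2025,<br> title = {Example Paper Title},<br> year = {2025}<br>}",
//     "One-sentence example abstract.",
//     "https://arxiv.org/abs/0000.00000",
//     "https://github.com/example/repo",
//     null,  // press
//     null,  // slides/poster
//     null,  // talk
//     "Optional note shown in italics under the entry"
// )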
document.write("</ul>")
document.write("<ul>")
document.write("</ul><br>")
document.write("<h1>2025</h1>")
document.write("<ul>")
add_paper("Mamba-R: Vision Mamba ALSO Needs Registers",
"Feng Wang, Jiahao Wang, Sucheng Ren, Guoyizhe Wei, Jieru Mei, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie",
"CVPR, 2025",
"https://arxiv.org/abs/2405.14858",
"@article{wang2024mambar,<br>" +
" title = {Mamba-R: Vision Mamba also needs registers},<br>" +
" author = {Wang, Feng and Wang, Jiahao and Ren, Sucheng and Wei, Guoyizhe and Mei, Jieru and Shao, Wei and Zhou, Yuyin and Yuille, Alan and Xie, Cihang},<br>" +
" journal = {CVPR},<br>" +
" year = {2025},<br>",
"Similar to Vision Transformers, this paper identifies artifacts also present within the feature maps of Vision Mamba. These artifacts, corresponding to high-norm tokens emerging in low-information background areas of images, appear much more severe in Vision Mamba -- they exist prevalently even with the tiny-sized model and activate extensively across background regions. To mitigate this issue, we follow the prior solution of introducing register tokens into Vision Mamba. To better cope with Mamba blocks' uni-directional inference paradigm, two key modifications are introduced: 1) evenly inserting registers throughout the input token sequence, and 2) recycling registers for final decision predictions. We term this new architecture Mamba-R. Qualitative observations suggest, compared to vanilla Vision Mamba, Mamba-R's feature maps appear cleaner and more focused on semantically meaningful regions. Quantitatively, Mamba-R attains stronger performance and scales better. For example, on the ImageNet benchmark, our base-size Mamba-R attains 82.9% accuracy, significantly outperforming Vim-B's 81.8%; furthermore, we provide the first successful scaling to the large model size (i.e., with 341M parameters), attaining a competitive accuracy of 83.2% (84.5% if finetuned with 384x384 inputs). Additional validation on the downstream semantic segmentation task also supports Mamba-R's efficacy.",
"https://arxiv.org/abs/2405.14858",
"https://wangf3014.github.io/mambar-page/"
)
add_paper("Causal Image Modeling for Efficient Visual Understanding",
"Feng Wang, Timing Yang, Yaodong Yu, Sucheng Ren, Guoyizhe Wei, Angtian Wang, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie",
"CVPR, 2025",
"https://arxiv.org/abs/2410.07599",
"@article{wang2024causalimagemodeling,<br>" +
" title = {Causal Image Modeling for Efficient Visual Understanding},<br>" +
" author = {Feng Wang, Timing Yang, Yaodong Yu, Sucheng Ren, Guoyizhe Wei, Angtian Wang, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie},<br>" +
" journal = {CVPR},<br>" +
" year = {2025},<br>",
"In this work, we present a comprehensive analysis of causal image modeling and introduce the Adventurer series models where we treat images as sequences of patch tokens and employ uni-directional language models to learn visual representations. This modeling paradigm allows us to process images in a recurrent formulation with linear complexity relative to the sequence length, which can effectively address the memory and computation explosion issues posed by high-resolution and fine-grained images. In detail, we introduce two simple designs that seamlessly integrate image inputs into the causal inference framework: a global pooling token placed at the beginning of the sequence and a flipping operation between every two layers. Extensive empirical studies demonstrate the significant efficiency and effectiveness of this causal image modeling paradigm. For example, our base-sized Adventurer model attains a competitive test accuracy of 84.0% on the standard ImageNet-1k benchmark with 216 images/s training throughput, which is 5.3 times more efficient than vision transformers to achieve the same result.",
"https://arxiv.org/abs/2410.07599",
"https://github.com/wangf3014/Adventurer"
)
add_paper("Generative Image Layer Decomposition with Visual Effects",
"Jinrui Yang, Qing Liu, Yijun Li, Soo Ye Kim, Daniil Pakhomov, Mengwei Ren, Jianming Zhang, Zhe Lin, Cihang Xie, Yuyin Zhou",
"CVPR, 2025",
"https://arxiv.org/abs/2411.17864",
"@article{yang2024generative,<br>" +
" title = {Generative Image Layer Decomposition with Visual Effects},<br>" +
" author = {Jinrui Yang, Qing Liu, Yijun Li, Soo Ye Kim, Daniil Pakhomov, Mengwei Ren, Jianming Zhang, Zhe Lin, Cihang Xie, Yuyin Zhou},<br>" +
" journal = {CVPR},<br>" +
" year = {2025},<br>",
"Recent advancements in large generative models, particularly diffusion-based methods, have significantly enhanced the capabilities of image editing. However, achieving precise control over image composition tasks remains a challenge. Layered representations, which allow for independent editing of image components, are essential for user-driven content creation, yet existing approaches often struggle to decompose image into plausible layers with accurately retained transparent visual effects such as shadows and reflections. We propose a generative framework for image layer decomposition which outputs photorealistic clean backgrounds and high-quality transparent foregrounds with faithfully preserved visual effects. To enable effective training, we first introduce a dataset preparation pipeline that automatically scales up simulated multi-layer data with synthesized visual effects. To further enhance real-world applicability, we supplement this simulated dataset with camera-captured images containing natural visual effects. Additionally, we propose a consistency loss which enforces the model to learn accurate representations for the transparent foreground layer when ground-truth annotations are not available. Our method achieves superior quality in layer decomposition, outperforming existing approaches in object removal and spatial editing tasks across several benchmarks and multiple user studies, unlocking various creative possibilities for layer-wise image editing. The project page is https://rayjryang.github.io/LayerDecomp.",
"https://arxiv.org/abs/2411.17864",
"https://rayjryang.github.io/LayerDecomp"
)
add_paper("HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing",
"Mude Hui, Siwei Yang, Bingchen Zhao, Yichun Shi, Heng Wang, Peng Wang, Cihang Xie, Yuyin Zhou",
"ICLR, 2025",
"https://arxiv.org/abs/2404.09990",
"@article{hui2024hqedit,<br>" +
" title = {HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing},<br>" +
" author = {Hui, Mude and Yang, Siwei and Zhao, Bingchen and Shi, Yichun and Wang, Heng and Wang, Peng and Xie, Cihang and Zhou, Yuyin},<br>" +
" journal = {ICLR},<br>" +
" year = {2025},<br>",
"This study introduces HQ-Edit, a high-quality instruction-based image editing dataset with around 200,000 edits. Unlike prior approaches relying on attribute guidance or human feedback on building datasets, we devise a scalable data collection pipeline leveraging advanced foundation models, namely GPT-4V and DALL-E 3. To ensure its high quality, diverse examples are first collected online, expanded, and then used to create high-quality diptychs featuring input and output images with detailed text prompts, followed by precise alignment ensured through post-processing. In addition, we propose two evaluation metrics, Alignment and Coherence, to quantitatively assess the quality of image edit pairs using GPT-4V. HQ-Edits high-resolution images, rich in detail and accompanied by comprehensive editing prompts, substantially enhance the capabilities of existing image editing models. For example, an HQ-Edit finetuned InstructPix2Pix can attain state-of-the-art image editing performance, even surpassing those models fine-tuned with human-annotated data. The project page is https://thefllood.github.io/HQEdit_web/",
"https://arxiv.org/abs/2404.09990",
"https://thefllood.github.io/HQEdit_web/"
)
add_paper("Autoregressive Pretraining with Mamba in Vision",
"Sucheng Ren, Xianhang Li, Haoqin Tu, Feng Wang, Fangxun Shu, Lei Zhang, Jieru Mei, Linjie Yang, Peng Wang, Heng Wang, Alan Yuille, Cihang Xie",
"ICLR, 2025",
"https://arxiv.org/abs/2406.07537",
"@article{ren2024autoregressive,<br>" +
" title = {Autoregressive Pretraining with Mamba in Vision},<br>" +
" author = {Ren, Sucheng and Li, Xianhang and Tu, Haoqin and Wang, Feng and Shu, Fangxun and Zhang, Lei and Mei, Jieru and Yang, Linjie and Wang, Peng and Wang, Heng and Yuille, Alan and Xie, Cihang},<br>" +
" journal = {ICLR},<br>" +
" year = {2025},<br>",
"The vision community has started to build with the recently developed state space model, Mamba, as the new backbone for a range of tasks. This paper shows that Mamba's visual capability can be significantly enhanced through autoregressive pretraining, a direction not previously explored. Efficiency-wise, the autoregressive nature can well capitalize on the Mamba's unidirectional recurrent structure, enabling faster overall training speed compared to other training strategies like mask modeling. Performance-wise, autoregressive pretraining equips the Mamba architecture with markedly higher accuracy over its supervised-trained counterparts and, more importantly, successfully unlocks its scaling potential to large and even huge model sizes. For example, with autoregressive pretraining, a base-size Mamba attains 83.2% ImageNet accuracy, outperforming its supervised counterpart by 2.0%; our huge-size Mamba, the largest Vision Mamba to date, attains 85.0% ImageNet accuracy (85.5% when finetuned with 384x384 inputs), notably surpassing all other Mamba variants in vision. The code is available at https://github.com/OliverRensu/ARM.",
"https://arxiv.org/abs/2406.07537",
"https://github.com/OliverRensu/ARM"
)
add_paper("MedTrinity-25M: A Large-scale Multimodal Dataset with Multigranular Annotations for Medicine",
"Yunfei Xie, Ce Zhou, Lang Gao, Juncheng Wu, Xianhang Li, Hong-Yu Zhou, Sheng Liu, Lei Xing, James Zou, Cihang Xie, Yuyin Zhou",
"ICLR, 2025",
"https://arxiv.org/abs/2408.02900",
"@article{xie2024medtrinity,<br>" +
" title = {MedTrinity-25M: A Large-scale Multimodal Dataset with Multigranular Annotations for Medicine},<br>" +
" author = {Yunfei Xie, Ce Zhou, Lang Gao, Juncheng Wu, Xianhang Li, Hong-Yu Zhou, Sheng Liu, Lei Xing, James Zou, Cihang Xie, Yuyin Zhou},<br>" +
" journal = {ICLR},<br>" +
" year = {2025},<br>",
"This paper introduces MedTrinity-25M, a comprehensive, large-scale multimodal dataset for medicine, covering over 25 million images across 10 modalities, with multigranular annotations for more than 65 diseases. These enriched annotations encompass both global textual information, such as disease/lesion type, modality, region-specific descriptions, and inter-regional relationships, as well as detailed local annotations for regions of interest (ROIs), including bounding boxes, segmentation masks. Unlike existing approach which is limited by the availability of image-text pairs, we have developed the first automated pipeline that scales up multimodal data by generating multigranular visual and texual annotations (in the form of image-ROI-description triplets) without the need for any paired text descriptions. Specifically, data from over 90 different sources have been collected, preprocessed, and grounded using domain-specific expert models to identify ROIs related to abnormal regions. We then build a comprehensive knowledge base and prompt multimodal large language models to perform retrieval-augmented generation with the identified ROIs as guidance, resulting in multigranular texual descriptions. Compared to existing datasets, MedTrinity-25M provides the most enriched annotations, supporting a comprehensive range of multimodal tasks such as captioning and report generation, as well as vision-centric tasks like classification and segmentation. This dataset can be utilized to support large-scale pre-training of multimodal medical AI models, contributing to the development of future foundation models in the medical domain.",
"https://arxiv.org/abs/2408.02900",
"https://yunfeixie233.github.io/MedTrinity-25M/"
)
add_paper("A New Federated Learning Framework Against Gradient Inversion Attacks",
"Pengxin Guo, Shuang Zeng, Wenhao Chen, Xiaodan Zhang, Weihong Ren, Yuyin Zhou, Liangqiong Qu",
"AAAI, 2025",
"https://arxiv.org/abs/2412.07187",
"@inproceedings{guo2023new,<br>" +
" title = {A New Federated Learning Framework Against Gradient Inversion Attacks},<br>" +
" author = {Guo, Pengxin and Zeng, Shuang and Chen, Wenhao and Zhang, Xiaodan and Ren, Weihong and Zhou, Yuyin and Qu, Liangqiong},<br>" +
" booktitle = {AAAI},<br>" +
" year = {2025}<br>}",
"Federated Learning (FL) aims to protect data privacy by enabling clients to collectively train machine learning models without sharing their raw data. However, recent studies demonstrate that information exchanged during FL is subject to Gradient Inversion Attacks (GIA) and, consequently, a variety of privacy-preserving methods have been integrated into FL to thwart such attacks, such as Secure Multi-party Computing (SMC), Homomorphic Encryption (HE), and Differential Privacy (DP). Despite their ability to protect data privacy, these approaches inherently involve substantial privacy-utility trade-offs. By revisiting the key to privacy exposure in FL under GIA, which lies in the frequent sharing of model gradients that contain private data, we take a new perspective by designing a novel privacy preserve FL framework that effectively ``breaks the direct connection'' between the shared parameters and the local private data to defend against GIA. Specifically, we propose a Hypernetwork Federated Learning (HyperFL) framework that utilizes hypernetworks to generate the parameters of the local model and only the hypernetwork parameters are uploaded to the server for aggregation. Theoretical analyses demonstrate the convergence rate of the proposed HyperFL, while extensive experimental results show the privacy-preserving capability and comparable performance of HyperFL. Code is available at https://github.com/Pengxin-Guo/HyperFL.",
"https://arxiv.org/abs/2412.07187",
"https://github.com/Pengxin-Guo/HyperFL"
)
add_paper("ARVideo: Autoregressive Pretraining for Self-Supervised Video Representation Learning",
"Sucheng Ren, Hongru Zhu, Chen Wei, Yijiang Li, Alan Yuille, Cihang Xie",
"TMLR, 2025",
"https://arxiv.org/abs/2405.15160",
"@article{ren2024arvideo,<br>" +
" title = {ARVideo: Autoregressive Pretraining for Self-Supervised Video Representation Learning},<br>" +
" author = {Ren, Sucheng and Zhu, Hongru and Wei, Chen and Li, Yijiang and Yuille, Alan and Xie, Cihang},<br>" +
" journal = {TMLR},<br>" +
" year = {2025}<br>}",
"This paper presents a new self-supervised video representation learning framework, ARVideo, which autoregressively predicts the next video token in a tailored sequence order. Two key designs are included. First, we organize autoregressive video tokens into clusters that span both spatially and temporally, thereby enabling a richer aggregation of contextual information compared to the standard spatial-only or temporal-only clusters. Second, we adopt a randomized spatiotemporal prediction order to facilitate learning from multi-dimensional data, addressing the limitations of a handcrafted spatial-first or temporal-first sequence order. Extensive experiments establish ARVideo as an effective paradigm for self-supervised video representation learning. For example, when trained with the ViT-B backbone, ARVideo competitively attains 81.2% on Kinetics-400 and 70.9% on Something-Something V2, which are on par with the strong benchmark set by VideoMAE. Importantly, ARVideo also demonstrates higher training efficiency, i.e., it trains 14% faster and requires 58% less GPU memory compared to VideoMAE.",
"https://arxiv.org/abs/2405.15160",
)
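// The two ARVideo design choices described above (token clusters that span space and
// time, and a randomized prediction order) are easy to picture with a small toy. The
// sketch below is illustrative only, not the ARVideo code: the cluster sizes `ct`/`cs`
// and the plain Fisher-Yates shuffle are assumptions made for this example.
function toyARVideoOrder(T, H, W, ct, cs) {
  // Group token coordinates (t, h, w) into clusters covering ct frames and a
  // cs-by-cs spatial window, so each cluster spans both space and time.
  const clusters = new Map();
  for (let t = 0; t < T; t++)
    for (let h = 0; h < H; h++)
      for (let w = 0; w < W; w++) {
        const key = `${Math.floor(t / ct)}-${Math.floor(h / cs)}-${Math.floor(w / cs)}`;
        if (!clusters.has(key)) clusters.set(key, []);
        clusters.get(key).push([t, h, w]);
      }
  // Visit the clusters in a randomized order instead of a fixed spatial-first or
  // temporal-first raster scan.
  const order = [...clusters.values()];
  for (let i = order.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [order[i], order[j]] = [order[j], order[i]];
  }
  return order.flat(); // token order an autoregressive predictor would follow
}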
add_paper("AttnGCG: Enhancing Jailbreaking Attacks on LLMs with Attention Manipulation",
"Zijun Wang, Haoqin Tu, Jieru Mei, Bingchen Zhao, Yisen Wang, Cihang Xie",
"TMLR, 2025",
"https://arxiv.org/abs/2410.09040",
"@article{wang2024attngcg,<br>" +
" title = {AttnGCG: Enhancing Jailbreaking Attacks on LLMs with Attention Manipulation},<br>" +
" author = {Zijun Wang, Haoqin Tu, Jieru Mei, Bingchen Zhao, Yisen Wang, Cihang Xie},<br>" +
" journal = {TMLR},<br>" +
" year = {2025}<br>}",
"This paper studies the vulnerabilities of transformer-based Large Language Models (LLMs) to jailbreaking attacks, focusing specifically on the optimization-based Greedy Coordinate Gradient (GCG) strategy. We first observe a positive correlation between the effectiveness of attacks and the internal behaviors of the models. For instance, attacks tend to be less effective when models pay more attention to system prompts designed to ensure LLM safety alignment. Building on this discovery, we introduce an enhanced method that manipulates models' attention scores to facilitate LLM jailbreaking, which we term AttnGCG. Empirically, AttnGCG shows consistent improvements in attack efficacy across diverse LLMs, achieving an average increase of ~7% in the Llama-2 series and ~10% in the Gemma series. Our strategy also demonstrates robust attack transferability against both unseen harmful goals and black-box LLMs like GPT-3.5 and GPT-4. Moreover, we note our attention-score visualization is more interpretable, allowing us to gain better insights into how our targeted attention manipulation facilitates more effective jailbreaking. We release the code at https://github.com/UCSC-VLAA/AttnGCG-attack.",
"https://arxiv.org/abs/2410.09040",
"https://github.com/UCSC-VLAA/AttnGCG-attack"
)
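// The AttnGCG abstract above describes augmenting the GCG objective with an attention
// term. The function below is a purely illustrative sketch of that kind of combined
// score, not the released attack code: the fields `targetLoss`, `attention`,
// `systemTokenIds`, and the linear weighting are all assumptions made for this example.
function toyAttnGCGObjective(candidate, attnWeight) {
  // candidate.targetLoss: the usual GCG loss on the target continuation
  // candidate.attention: attention scores over prompt tokens at the final position
  // candidate.systemTokenIds: indices of the system-prompt tokens
  const systemAttn = candidate.systemTokenIds
    .reduce((sum, i) => sum + candidate.attention[i], 0);
  // Lower is better for the suffix search: elicit the target response while steering
  // attention away from the safety-oriented system prompt.
  return candidate.targetLoss + attnWeight * systemAttn;
}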
add_paper("SPFormer: Enhancing Vision Transformer with Superpixel Representation",
"Jieru Mei, Liang-Chieh Chen, Alan Yuille, Cihang Xie",
"TMLR, 2025",
"https://arxiv.org/abs/2401.02931",
"@article{mei2024spformer,<br>" +
" title = {SPFormer: Enhancing Vision Transformer with Superpixel Representation},<br>" +
" author = {Mei, Jieru and Chen, Liang-Chieh and Yuille, Alan and Xie, Cihang},<br>" +
" journal = {TMLR},<br>" +
" year = {2025}<br>}",
"In this work, we introduce SPFormer, a novel Vision Transformer enhanced by superpixel representation. Addressing the limitations of traditional Vision Transformers' fixed-size, non-adaptive patch partitioning, SPFormer employs superpixels that adapt to the image's content. This approach divides the image into irregular, semantically coherent regions, effectively capturing intricate details and applicable at both initial and intermediate feature levels. SPFormer, trainable end-to-end, exhibits superior performance across various benchmarks. Notably, it exhibits significant improvements on the challenging ImageNet benchmark, achieving a 1.4% increase over DeiT-T and 1.1% over DeiT-S respectively. A standout feature of SPFormer is its inherent explainability. The superpixel structure offers a window into the model's internal processes, providing valuable insights that enhance the model's interpretability. This level of clarity significantly improves SPFormer's robustness, particularly in challenging scenarios such as image rotations and occlusions, demonstrating its adaptability and resilience.",
"https://arxiv.org/abs/2401.02931"
)
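// SPFormer's core move, as summarized above, is to pool features over content-adaptive
// superpixels instead of fixed square patches. The helper below is a toy illustration,
// not the SPFormer implementation: the per-pixel feature array and the precomputed
// superpixel label map are assumed inputs for this example.
function toySuperpixelTokens(pixelFeatures, superpixelIds) {
  // pixelFeatures: one feature vector per pixel; superpixelIds: one integer label per pixel.
  const sums = new Map();
  pixelFeatures.forEach((feat, p) => {
    const id = superpixelIds[p];
    if (!sums.has(id)) sums.set(id, { total: feat.map(() => 0), count: 0 });
    const entry = sums.get(id);
    feat.forEach((v, d) => { entry.total[d] += v; });
    entry.count += 1;
  });
  // One token per superpixel: the mean feature of the pixels it covers, so each token
  // corresponds to an irregular, semantically coherent region.
  return [...sums.values()].map(e => e.total.map(v => v / e.count));
}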
add_paper("AQA-Bench: An Interactive Benchmark for Evaluating LLMs' Sequential Reasoning Ability",
"Siwei Yang, Bingchen Zhao, Cihang Xie",
"TMLR, 2025",
"https://arxiv.org/abs/2402.09404",
"@article{yang2024aqabench,<br>" +
" title = {AQA-Bench: An Interactive Benchmark for Evaluating LLMs' Sequential Reasoning Ability},<br>" +
" author = {Yang, Siwei and Zhao, Bingchen and Xie, Cihang},<br>" +
" journal = {TMLR},<br>" +