Research
Our research has been generously supported by ARO, NSF, AFRL, IARPA, BlueHalo and Salesforce.
2024
Jin, Richeng; Liu, Yuding; Huang, Yufan; He, Xiaofan; Wu, Tianfu; Dai, Huaiyu
Sign-Based Gradient Descent With Heterogeneous Data: Convergence and Byzantine Resilience Journal Article
In: IEEE Transactions on Neural Networks and Learning Systems, pp. 1-13, 2024.
@article{10398739,
title = {Sign-Based Gradient Descent With Heterogeneous Data: Convergence and Byzantine Resilience},
author = {Richeng Jin and Yuding Liu and Yufan Huang and Xiaofan He and Tianfu Wu and Huaiyu Dai},
doi = {10.1109/TNNLS.2023.3345367},
year = {2024},
date = {2024-01-01},
journal = {IEEE Transactions on Neural Networks and Learning Systems},
pages = {1-13},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2023
Xue, Nan; Wu, Tianfu; Bai, Song; Wang, Fu-Dong; Xia, Gui-Song; Zhang, Liangpei; Torr, Philip H. S.
Holistically-Attracted Wireframe Parsing: From Supervised to Self-Supervised Learning Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), vol. 45, no. 12, pp. 14727-14744, 2023.
@article{nokey,
title = {Holistically-Attracted Wireframe Parsing: From Supervised to Self-Supervised Learning},
author = {Nan Xue and Tianfu Wu and Song Bai and Fu-Dong Wang and Gui-Song Xia and Liangpei Zhang and Philip H.S. Torr},
url = {https://arxiv.org/abs/2210.12971},
doi = {10.1109/TPAMI.2023.3312749},
year = {2023},
date = {2023-12-01},
urldate = {2023-03-14},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
volume = {45},
number = {12},
pages = {14727-14744},
abstract = {This paper presents Holistically-Attracted Wireframe Parsing (HAWP) for 2D images using both fully supervised and self-supervised learning paradigms. At the core is a parsimonious representation that encodes a line segment using a closed-form 4D geometric vector, which enables lifting line segments in wireframe to an end-to-end trainable holistic attraction field that has built-in geometry-awareness, context-awareness and robustness. The proposed HAWP consists of three components: generating line segment and end-point proposal, binding line segment and end-point, and end-point-decoupled lines-of-interest verification. For self-supervised learning, a simulation-to-reality pipeline is exploited in which a HAWP is first trained using synthetic data and then used to ``annotate" wireframes in real images with Homographic Adaptation. With the self-supervised annotations, a HAWP model for real images is trained from scratch. In experiments, the proposed HAWP achieves state-of-the-art performance in both the Wireframe dataset and the YorkUrban dataset in fully-supervised learning. It also demonstrates a significantly better repeatability score than prior arts with much more efficient training in self-supervised learning. Furthermore, the self-supervised HAWP shows great potential for general wireframe parsing without onerous wireframe labels.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Tan, Bin; Xue, Nan; Wu, Tianfu; Xia, Gui-Song
NOPE-SAC: Neural One-Plane RANSAC for Sparse-View Planar 3D Reconstruction Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), vol. 45, no. 12, pp. 15233-15248, 2023.
@article{nope-sac,
title = {NOPE-SAC: Neural One-Plane RANSAC for Sparse-View Planar 3D Reconstruction},
author = {Bin Tan and Nan Xue and Tianfu Wu and Gui-Song Xia},
url = {https://arxiv.org/abs/2211.16799},
doi = {10.1109/TPAMI.2023.3314745},
year = {2023},
date = {2023-12-01},
urldate = {2023-05-23},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
volume = {45},
number = {12},
pages = {15233-15248},
abstract = {This paper studies the challenging two-view 3D reconstruction in a rigorous sparse-view configuration, which is suffering from insufficient correspondences in the input image pairs for camera pose estimation. We present a novel Neural One-PlanE RANSAC framework (termed NOPE-SAC in short) that exerts excellent capability to learn one-plane pose hypotheses from 3D plane correspondences. Building on the top of a siamese plane detection network, our NOPE-SAC first generates putative plane correspondences with a coarse initial pose. It then feeds the learned 3D plane parameters of correspondences into shared MLPs to estimate the one-plane camera pose hypotheses, which are subsequently reweighed in a RANSAC manner to obtain the final camera pose. Because the neural one-plane pose minimizes the number of plane correspondences for adaptive pose hypotheses generation, it enables stable pose voting and reliable pose refinement in a few plane correspondences for the sparse-view inputs. In the experiments, we demonstrate that our NOPE-SAC significantly improves the camera pose estimation for the two-view inputs with severe viewpoint changes, setting several new state-of-the-art performances on two challenging benchmarks, i.e., MatterPort3D and ScanNet, for sparse-view 3D reconstruction. The source code is released at https://github.com/IceTTTb/NopeSAC for reproducible research.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Jiang, Bo; Krim, Hamid; Wu, Tianfu; Cansever, Derya
Implicit Bayes Adaptation: A Collaborative Transport Approach Proceedings Article
In: IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4-10, 2023, pp. 1–5, 2023.
@inproceedings{DBLP:conf/icassp/JiangKWC23,
title = {Implicit Bayes Adaptation: A Collaborative Transport Approach},
author = {Bo Jiang and Hamid Krim and Tianfu Wu and Derya Cansever},
url = {https://doi.org/10.1109/ICASSP49357.2023.10096599},
doi = {10.1109/ICASSP49357.2023.10096599},
year = {2023},
date = {2023-01-01},
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing
ICASSP 2023, Rhodes Island, Greece, June 4-10, 2023},
pages = {1\textendash5},
crossref = {DBLP:conf/icassp/2023},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xiao, Yuxi; Xue, Nan; Wu, Tianfu; Xia, Gui-Song
Level-S$^2$fM: Structure from Motion on Neural Level Set of Implicit Surfaces Proceedings Article
In: CVPR, 2023.
@inproceedings{levelS2fM,
title = {Level-S$^2$fM: Structure from Motion on Neural Level Set of Implicit Surfaces},
author = {Yuxi Xiao and Nan Xue and Tianfu Wu and Gui-Song Xia},
url = {https://arxiv.org/abs/2211.12018},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {CVPR},
abstract = {This paper presents a neural incremental Structure-from-Motion (SfM) approach, Level-S2fM. In our formulation, we aim at simultaneously learning coordinate MLPs for the implicit surfaces and the radiance fields, and estimating the camera poses and scene geometry, which is mainly sourced from the established keypoint correspondences by SIFT. Our formulation would face some new challenges due to inevitable two-view and few-view configurations at the beginning of incremental SfM pipeline for the optimization of coordinate MLPs, but we found that the strong inductive biases conveying in the 2D correspondences are feasible and promising to avoid those challenges by exploiting the relationship between the ray sampling schemes used in volumetric rendering and the sphere tracing of finding the zero-level set of implicit surfaces. Based on this, we revisit the pipeline of incremental SfM and renew the key components of two-view geometry initialization, the camera pose registration, and the 3D points triangulation, as well as the Bundle Adjustment in a novel perspective of neural implicit surfaces. Because the coordinate MLPs unified the scene geometry in small MLP networks, our Level-S2fM treats the zero-level set of the implicit surface as an informative top-down regularization to manage the reconstructed 3D points, reject the outlier of correspondences by querying SDF, adjust the estimated geometries by NBA (Neural BA), finally yielding promising results of 3D reconstruction. Furthermore, our Level-S2fM alleviated the requirement of camera poses for neural 3D reconstruction.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Grainger, Ryan; Paniagua, Thomas; Song, Xi; Cuntoor, Naresh; Lee, Mun Wai; Wu, Tianfu
PaCa-ViT: Learning Patch-to-Cluster Attention in Vision Transformers Proceedings Article
In: CVPR, 2023.
@inproceedings{PaCaViTb,
title = {PaCa-ViT: Learning Patch-to-Cluster Attention in Vision Transformers},
author = {Ryan Grainger and Thomas Paniagua and Xi Song and Naresh Cuntoor and Mun Wai Lee and Tianfu Wu},
url = {https://arxiv.org/abs/2203.11987},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {CVPR},
abstract = {Vision Transformers (ViTs) are built on the assumption of treating image patches as ``visual tokens" and learn patch-to-patch attention. The patch embedding based tokenizer has a semantic gap with respect to its counterpart, the textual tokenizer. The patch-to-patch attention suffers from the quadratic complexity issue, and also makes it non-trivial to explain learned ViTs. To address these issues in ViT, this paper proposes to learn Patch-to-Cluster attention (PaCa) in ViT. Queries in our PaCa-ViT starts with patches, while keys and values are directly based on clustering (with a predefined small number of clusters). The clusters are learned end-to-end, leading to better tokenizers and inducing joint clustering-for-attention and attention-for-clustering for better and interpretable models. The quadratic complexity is relaxed to linear complexity. The proposed PaCa module is used in designing efficient and interpretable ViT backbones and semantic segmentation head networks. In experiments, the proposed methods are tested on ImageNet-1k image classification, MS-COCO object detection and instance segmentation and MIT-ADE20k semantic segmentation. Compared with the prior art, it obtains better performance in all the three benchmarks than the SWin and the PVTs by significant margins in ImageNet-1k and MIT-ADE20k. It is also significantly more efficient than PVT models in MS-COCO and MIT-ADE20k due to the linear complexity. The learned clusters are semantically meaningful.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
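As a rough illustration of the patch-to-cluster attention described in the abstract above (a simplified PyTorch sketch under our own assumptions, not the released PaCa-ViT code; PatchToClusterAttention and its arguments are hypothetical names), queries come from patch tokens while keys and values come from a small set of cluster tokens formed by a learned soft assignment, so the attention cost is linear in the number of patches for a fixed number of clusters.
# Illustrative sketch (our simplification, not the released PaCa-ViT code):
# patch-to-cluster attention with queries from patch tokens and keys/values
# from a small, learned set of cluster tokens.
import torch
import torch.nn as nn

class PatchToClusterAttention(nn.Module):
    def __init__(self, dim, num_clusters=49, num_heads=8):
        super().__init__()
        self.cluster_head = nn.Linear(dim, num_clusters)    # soft assignment logits
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, x):                                   # x: (B, N, C) patch tokens
        assign = self.cluster_head(x).softmax(dim=1)        # (B, N, M), normalized over patches
        clusters = torch.einsum("bnm,bnc->bmc", assign, x)  # (B, M, C) cluster tokens
        out, _ = self.attn(query=x, key=clusters, value=clusters)
        return out                                          # (B, N, C), linear in N for fixed M

if __name__ == "__main__":
    tokens = torch.randn(2, 196, 256)                       # e.g. 14x14 patches, 256-d embeddings
    print(PatchToClusterAttention(256)(tokens).shape)       # torch.Size([2, 196, 256])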
Kashyap, Priyank; Ravichandiran, Prasanth Prabu; Wang, Lee; Baron, Dror; Wong, Chau-Wai; Wu, Tianfu; Franzon, Paul D.
Thermal Estimation for 3D-ICs Through Generative Networks Proceedings Article
In: IEEE International 3D Systems Integration Conference, 3DIC 2023, Cork, Ireland, May 10-12, 2023, pp. 1–4, 2023.
@inproceedings{DBLP:conf/3dic/KashyapRWBWWF23,
title = {Thermal Estimation for 3D-ICs Through Generative Networks},
author = {Priyank Kashyap and Prasanth Prabu Ravichandiran and Lee Wang and Dror Baron and Chau-Wai Wong and Tianfu Wu and Paul D. Franzon},
url = {https://doi.org/10.1109/3DIC57175.2023.10154977},
doi = {10.1109/3DIC57175.2023.10154977},
year = {2023},
date = {2023-01-01},
booktitle = {IEEE International 3D Systems Integration Conference, 3DIC 2023,
Cork, Ireland, May 10-12, 2023},
pages = {1\textendash4},
crossref = {DBLP:conf/3dic/2023},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Savadikar, Chinmay; Dai, Michelle; Wu, Tianfu
Learning to Grow Artificial Hippocampi in Vision Transformers for Resilient Lifelong Learning Online
2023, visited: 14.03.2023.
@online{artihippo,
title = {Learning to Grow Artificial Hippocampi in Vision Transformers for Resilient Lifelong Learning},
author = {Chinmay Savadikar and Michelle Dai and Tianfu Wu},
url = {https://arxiv.org/pdf/2303.08250.pdf},
year = {2023},
date = {2023-03-14},
urldate = {2023-03-14},
abstract = {Lifelong learning without catastrophic forgetting (i.e., resiliency) possessed by human intelligence is entangled with sophisticated memory mechanisms in the brain, especially the long-term memory (LM) maintained by Hippocampi. To a certain extent, Transformers have emerged as the counterpart “Brain” of Artificial Intelligence (AI), and yet leave the LM component under-explored for lifelong learning settings. This paper presents a method of learning to grow Artificial Hippocampi
(ArtiHippo) in Vision Transformers (ViTs) for resilient lifelong learning. With a comprehensive ablation study, the final linear projection layer in the multi-head self-attention (MHSA) block is selected in realizing and growing ArtiHippo. ArtiHippo is represented by a mixture of experts (MoEs). Each expert component is an on-site variant of the linear projection layer, which is maintained via neural architecture search (NAS) with the search space defined by four basic growing operations \textendash skip, reuse, adapt, and new in lifelong learning. The LM of a task consists of two parts: the dedicated expert components (as model parameters) at different layers of a ViT learned via NAS, and the mean class-tokens (as stored latent vectors for measuring task similarity) associated with the expert components. For a new task, a hierarchical task-similarity-oriented exploration-exploitation sampling based NAS is proposed to learn the expert components. The task similarity is measured based on the normalized cosine similarity between the mean class-token of the new task and those of old tasks. The proposed method is complementary to prompt-based lifelong learning with ViTs. In experiments, the proposed method is tested on the challenging Visual Domain Decathlon (VDD) benchmark and the recently proposed 5-Dataset benchmark. It obtains consistently better performance than the prior art with sensible ArtiHippo learned continually},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
Xue, Nan; Tan, Bin; Xiao, Yuxi; Dong, Liang; Xia, Gui-Song; Wu, Tianfu
Volumetric Wireframe Parsing from Neural Attraction Fields Online
2023, visited: 21.07.2023.
@online{NEAT,
title = {Volumetric Wireframe Parsing from Neural Attraction Fields},
author = {Nan Xue and Bin Tan and Yuxi Xiao and Liang Dong and Gui-Song Xia and Tianfu Wu},
url = {https://arxiv.org/abs/2307.10206},
year = {2023},
date = {2023-07-21},
urldate = {2023-07-21},
abstract = {The primal sketch is a fundamental representation in Marr's vision theory, which allows for parsimonious image-level processing from 2D to 2.5D perception. This paper takes a further step by computing 3D primal sketch of wireframes from a set of images with known camera poses, in which we take the 2D wireframes in multi-view images as the basis to compute 3D wireframes in a volumetric rendering formulation. In our method, we first propose a NEural Attraction (NEAT) Fields that parameterizes the 3D line segments with coordinate Multi-Layer Perceptrons (MLPs), enabling us to learn the 3D line segments from 2D observation without incurring any explicit feature correspondences across views. We then present a novel Global Junction Perceiving (GJP) module to perceive meaningful 3D junctions from the NEAT Fields of 3D line segments by optimizing a randomly initialized high-dimensional latent array and a lightweight decoding MLP. Benefitting from our explicit modeling of 3D junctions, we finally compute the primal sketch of 3D wireframes by attracting the queried 3D line segments to the 3D junctions, significantly simplifying the computation paradigm of 3D wireframe parsing. In experiments, we evaluate our approach on the DTU and BlendedMVS datasets with promising performance obtained. As far as we know, our method is the first approach to achieve high-fidelity 3D wireframe parsing without requiring explicit matching.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
Reza, Md Farhamdur; Rahmati, Ali; Wu, Tianfu; Dai, Huaiyu
CGBA: Curvature-aware Geometric Black-box Attack Proceedings
In: ICCV'23, 2023.
@proceedings{CGBA,
title = {CGBA: Curvature-aware Geometric Black-box Attack},
author = {Md Farhamdur Reza and Ali Rahmati and Tianfu Wu and Huaiyu Dai},
url = {https://arxiv.org/abs/2308.03163},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
abstract = {Decision-based black-box attacks often necessitate a large number of queries to craft an adversarial example.
Moreover, decision-based attacks based on querying boundary points in the estimated normal vector direction often suffer from inefficiency and convergence issues.
In this paper, we propose a novel query-efficient curvature-aware geometric decision-based black-box attack (CGBA) that conducts boundary search along a semicircular path on a restricted 2D plane to ensure finding a boundary point successfully irrespective of the boundary curvature.
While the proposed CGBA attack can work effectively for an arbitrary decision boundary, it is particularly efficient in exploiting the low curvature to craft high-quality adversarial examples, which is widely seen and experimentally verified in commonly used classifiers under non-targeted attacks. In contrast, the decision boundaries often exhibit higher curvature under targeted attacks.
Thus, we develop a new query-efficient variant, CGBA-H, that is adapted for the targeted attack.
In addition, we further design an algorithm to obtain a better initial boundary point at the expense of some extra queries, which considerably enhances the performance of the targeted attack. Extensive experiments are conducted to evaluate the performance of our proposed methods against some well-known classifiers on the ImageNet and CIFAR10 datasets, demonstrating the superiority of CGBA and CGBA-H over state-of-the-art non-targeted and targeted attacks, respectively.
The source code is available at https://github.com/Farhamdur/CGBA.},
howpublished = {in: ICCV'23},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
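As a rough, hedged illustration of the decision-based boundary search summarized above (a toy sketch of the general semicircular-path idea, not the CGBA algorithm or its released code; semicircle_point, boundary_search and the toy classifier are hypothetical), bisecting the angle along a semicircle whose diameter joins a benign input and a known adversarial point always returns a boundary point that is no farther from the benign input than the starting adversary.
# Toy sketch of a semicircular decision-boundary search (not CGBA itself).
import numpy as np

def semicircle_point(x_src, x_adv, v_orth, theta):
    """Point at angle theta on the semicircle whose diameter joins x_src and x_adv."""
    center = (x_src + x_adv) / 2.0
    radius = np.linalg.norm(x_adv - x_src) / 2.0
    u = (x_adv - x_src) / (2.0 * radius)              # unit vector along the diameter
    return center + radius * (np.cos(theta) * u + np.sin(theta) * v_orth)

def boundary_search(decide, x_src, x_adv, v_orth, iters=30):
    """decide(x) -> True if x is adversarial; bisection over theta in [0, pi]."""
    lo, hi = 0.0, np.pi                               # theta=0 -> x_adv (adversarial), theta=pi -> x_src (benign)
    for _ in range(iters):
        mid = (lo + hi) / 2.0
        if decide(semicircle_point(x_src, x_adv, v_orth, mid)):
            lo = mid                                  # still adversarial: move toward the benign end
        else:
            hi = mid
    return semicircle_point(x_src, x_adv, v_orth, lo)

if __name__ == "__main__":
    decide = lambda x: x[0] > 1.0                     # toy "classifier": adversarial iff first coordinate > 1
    x_src, x_adv = np.zeros(2), np.array([3.0, 0.0])
    v = np.array([0.0, 1.0])                          # orthogonal direction spanning the 2D search plane
    xb = boundary_search(decide, x_src, x_adv, v)
    print(np.round(xb, 3), np.linalg.norm(xb - x_src) <= np.linalg.norm(x_adv - x_src))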
Liu, Xianpeng; Zheng, Ce; Cheng, Kelvin; Xue, Nan; Qi, Guo-Jun; Wu, Tianfu
Monocular 3D Object Detection with Bounding Box Denoising in 3D by Perceiver Proceedings
In: ICCV'23, 2023.
@proceedings{monoxiver,
title = {Monocular 3D Object Detection with Bounding Box Denoising in 3D by Perceiver},
author = {Xianpeng Liu and Ce Zheng and Kelvin Cheng and Nan Xue and Guo-Jun Qi and Tianfu Wu},
url = {https://arxiv.org/abs/2304.01289},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
abstract = {The main challenge of monocular 3D object detection is the accurate localization of 3D center. Motivated by a new and strong observation that this challenge can be remedied by a 3D-space local-grid search scheme in an ideal case, we propose a stage-wise approach, which combines the information flow from 2D-to-3D (3D bounding box proposal generation with a single 2D image) and 3D-to-2D (proposal verification by denoising with 3D-to-2D contexts) in a top-down manner. Specifically, we first obtain initial proposals from off-the-shelf backbone monocular 3D detectors. Then, we generate a 3D anchor space by local-grid sampling from the initial proposals. Finally, we perform 3D bounding box denoising at the 3D-to-2D proposal verification stage. To effectively learn discriminative features for denoising highly overlapped proposals, this paper presents a method of using the Perceiver I/O model to fuse the 3D-to-2D geometric information and the 2D appearance information. With the encoded latent representation of a proposal, the verification head is implemented with a self-attention module. Our method, named as MonoXiver, is generic and can be easily adapted to any backbone monocular 3D detectors. Experimental results on the well-established KITTI dataset and the challenging large-scale Waymo dataset show that MonoXiver consistently achieves improvement with limited computation overhead.},
howpublished = {in: ICCV'23},
keywords = {},
pubstate = {published},
tppubtype = {proceedings}
}
2022
Foster, Marc; Wu, Tianfu; Roberts, David L.; Bozkurt, Alper
Preliminary Evaluation of a System with On-Body and Aerial Sensors for Monitoring Working Dogs Journal Article
In: Sensors, vol. 22, no. 19, pp. 7631, 2022.
@article{DBLP:journals/sensors/FosterWRB22,
title = {Preliminary Evaluation of a System with On-Body and Aerial Sensors
for Monitoring Working Dogs},
author = {Marc Foster and Tianfu Wu and David L. Roberts and Alper Bozkurt},
url = {https://doi.org/10.3390/s22197631},
doi = {10.3390/S22197631},
year = {2022},
date = {2022-01-01},
journal = {Sensors},
volume = {22},
number = {19},
pages = {7631},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Liu, Xianpeng; Xue, Nan; Wu, Tianfu
Learning Auxiliary Monocular Contexts Helps Monocular 3D Object Detection Proceedings Article
In: AAAI 2022, 2022.
@inproceedings{MonoCon,
title = {Learning Auxiliary Monocular Contexts Helps Monocular 3D Object Detection},
author = {Xianpeng Liu and Nan Xue and Tianfu Wu},
url = {https://arxiv.org/abs/2112.04628},
year = {2022},
date = {2022-02-22},
urldate = {2022-02-22},
booktitle = {AAAI 2022},
abstract = {Monocular 3D object detection aims to localize 3D bounding boxes in an input single 2D image. It is a highly challenging problem and remains open, especially when no extra information (e.g., depth, lidar and/or multi-frames) can be leveraged in training and/or inference. This paper proposes a simple yet effective formulation for monocular 3D object detection without exploiting any extra information. It presents the MonoCon method which learns Monocular Contexts, as auxiliary tasks in training, to help monocular 3D object detection. The key idea is that with the annotated 3D bounding boxes of objects in an image, there is a rich set of well-posed projected 2D supervision signals available in training, such as the projected corner keypoints and their associated offset vectors with respect to the center of 2D bounding box, which should be exploited as auxiliary tasks in training. The proposed MonoCon is motivated by the Cramer-Wold theorem in measure theory at a high level. In implementation, it utilizes a very simple end-to-end design to justify the effectiveness of learning auxiliary monocular contexts, which consists of three components: a Deep Neural Network (DNN) based feature backbone, a number of regression head branches for learning the essential parameters used in the 3D bounding box prediction, and a number of regression head branches for learning auxiliary contexts. After training, the auxiliary context regression branches are discarded for better inference efficiency. In experiments, the proposed MonoCon is tested in the KITTI benchmark (car, pedestrain and cyclist). It outperforms all prior arts in the leaderboard on car category and obtains comparable performance on pedestrian and cyclist in terms of accuracy. Thanks to the simple design, the proposed MonoCon method obtains the fastest inference speed with 38.7 fps in comparisons},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
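As a rough illustration of the training-only auxiliary heads described in the abstract above (a toy PyTorch sketch under our own assumptions, not the MonoCon implementation; MonoHead and its branches are hypothetical stand-ins), auxiliary context branches contribute supervision during training and are simply skipped at inference.
# Toy sketch: auxiliary regression branches used only during training.
import torch
import torch.nn as nn

class MonoHead(nn.Module):
    def __init__(self, feat_dim=64, num_aux=4):
        super().__init__()
        self.backbone = nn.Conv2d(3, feat_dim, 3, padding=1)   # stand-in feature backbone
        self.box3d_head = nn.Conv2d(feat_dim, 7, 1)            # essential 3D box parameters
        self.aux_heads = nn.ModuleList(                        # e.g. projected corners, offsets
            [nn.Conv2d(feat_dim, 2, 1) for _ in range(num_aux)])

    def forward(self, img, training=True):
        feats = self.backbone(img).relu()
        outputs = {"box3d": self.box3d_head(feats)}
        if training:                                           # auxiliary contexts only in training
            outputs["aux"] = [h(feats) for h in self.aux_heads]
        return outputs

if __name__ == "__main__":
    img = torch.randn(1, 3, 96, 320)
    out = MonoHead()(img, training=False)                      # auxiliary branches discarded at inference
    print(sorted(out.keys()), out["box3d"].shape)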
Xue, Nan; Wu, Tianfu; Xia, Gui-Song; Zhang, Liangpei
Learning Local-Global Contextual Adaptation for Multi-Person Pose Estimation Proceedings Article
In: CVPR 2022, 2022.
@inproceedings{LOGOCAPb,
title = {Learning Local-Global Contextual Adaptation for Multi-Person Pose Estimation},
author = {Nan Xue and Tianfu Wu and Gui-Song Xia and Liangpei Zhang},
url = {https://arxiv.org/abs/2109.03622},
year = {2022},
date = {2022-06-21},
urldate = {2022-06-21},
booktitle = {CVPR 2022},
abstract = {This paper studies the problem of multi-person pose estimation in a bottom-up fashion. With a new and strong observation that the localization issue of the center-offset formulation can be remedied in a local-window search scheme in an ideal situation, we propose a multi-person pose estimation approach, dubbed as LOGO-CAP, by learning the LOcal-GlObal Contextual Adaptation for human Pose. Specifically, our approach learns the keypoint attraction maps (KAMs) from the local keypoints expansion maps (KEMs) in small local windows in the first step, which are subsequently treated as dynamic convolutional kernels on the keypoints-focused global heatmaps for contextual adaptation, achieving accurate multi-person pose estimation. Our method is end-to-end trainable with near real-time inference speed in a single forward pass, obtaining state-of-the-art performance on the COCO keypoint benchmark for bottom-up human pose estimation. With the COCO trained model, our method also outperforms prior arts by a large margin on the challenging OCHuman dataset.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jiang, Bo; Krim, Hamid; Wu, Tianfu; Cansever, Derya
Refining Self-Supervised Learning in Imaging: Beyond Linear Metric Proceedings Article
In: 2022 IEEE International Conference on Image Processing, ICIP 2022, Bordeaux, France, 16-19 October 2022, pp. 76–80, 2022.
@inproceedings{DBLP:conf/icip/JiangK0C22,
title = {Refining Self-Supervised Learning in Imaging: Beyond Linear Metric},
author = {Bo Jiang and Hamid Krim and Tianfu Wu and Derya Cansever},
url = {https://doi.org/10.1109/ICIP46576.2022.9897745},
doi = {10.1109/ICIP46576.2022.9897745},
year = {2022},
date = {2022-01-01},
booktitle = {2022 IEEE International Conference on Image Processing, ICIP 2022,
Bordeaux, France, 16-19 October 2022},
pages = {76\textendash80},
crossref = {DBLP:conf/icip/2022},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Cheng, Kelvin; Wu, Tianfu; Healey, Christopher
Revisiting Non-Parametric Matching Cost Volumes for Robust and Generalizable Stereo Matching Proceedings Article
In: NeurIPS 2022, 2022.
@inproceedings{Cheng2022,
title = {Revisiting Non-Parametric Matching Cost Volumes for Robust and Generalizable Stereo Matching},
author = {Kelvin Cheng and Tianfu Wu and Christopher Healey},
url = {https://openreview.net/forum?id=WXdSp8k0TMn},
year = {2022},
date = {2022-11-30},
urldate = {2022-11-30},
booktitle = {NeurIPS 2022},
abstract = {Stereo matching is a classic challenging problem in computer vision, which has recently witnessed remarkable progress by Deep Neural Networks (DNNs). This paradigm shift leads to two interesting and entangled questions that have not been addressed well. First, it is unclear whether stereo matching DNNs that are trained from scratch really learn to perform matching well. This paper studies this problem from the lens of white-box adversarial attacks. It presents a method of learning stereo-constrained photometrically-consistent attacks, which by design are weaker adversarial attacks, and yet can cause catastrophic performance drop for those DNNs. This observation suggests that they may not actually learn to perform matching well in the sense that they should otherwise achieve potentially even better after stereo-constrained perturbations are introduced. Second, stereo matching DNNs are typically trained under the simulation-to-real (Sim2Real) pipeline due to the data hungriness of DNNs. Thus, alleviating the impacts of the Sim2Real photometric gap in stereo matching DNNs becomes a pressing need. Towards joint adversarially robust and domain generalizable stereo matching, this paper proposes to learn DNN-contextualized binary-pattern-driven non-parametric cost-volumes. It leverages the perspective of learning the cost aggregation via DNNs, and presents a simple yet expressive design that is fully end-to-end trainable, without resorting to specific aggregation inductive biases. In experiments, the proposed method is tested in the SceneFlow dataset, the KITTI2015 dataset, and the Middlebury dataset. It significantly improves the adversarial robustness, while retaining accuracy performance comparable to state-of-the-art methods. It also shows a better Sim2Real generalizability. Our code and pretrained models are released at https://github.com/kelkelcheng/AdversariallyRobustStereo},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Kashyap, Priyank; Gajjar, Archit; Choi, Yongjin; Wong, Chau-Wai; Baron, Dror; Wu, Tianfu; Cheng, Chris; Franzon, Paul D.
RxGAN: Modeling High-Speed Receiver through Generative Adversarial Networks Proceedings Article
In: 2022 ACM/IEEE Workshop on Machine Learning for CAD, MLCAD 2022, Virtual Event, China, September 12-13, 2022, pp. 167–172, 2022.
@inproceedings{DBLP:conf/mlcad/KashyapGCWBWCF22,
title = {RxGAN: Modeling High-Speed Receiver through Generative Adversarial
Networks},
author = {Priyank Kashyap and Archit Gajjar and Yongjin Choi and Chau-Wai Wong and Dror Baron and Tianfu Wu and Chris Cheng and Paul D. Franzon},
url = {https://doi.org/10.1145/3551901.3556480},
doi = {10.1145/3551901.3556480},
year = {2022},
date = {2022-01-01},
booktitle = {2022 ACM/IEEE Workshop on Machine Learning for CAD, MLCAD 2022,
Virtual Event, China, September 12-13, 2022},
pages = {167\textendash172},
crossref = {DBLP:conf/mlcad/2022},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Shen, Jianghao; Wu, Tianfu
Learning Inception Attention for Image Synthesis and Image Recognition Online
2022, visited: 01.02.2022.
@online{SLIM,
title = {Learning Inception Attention for Image Synthesis and Image Recognition},
author = {Jianghao Shen and Tianfu Wu},
url = {https://arxiv.org/abs/2112.14804},
year = {2022},
date = {2022-02-01},
urldate = {2022-02-01},
abstract = {Image synthesis and image recognition have witnessed remarkable progress, but often at the expense of computationally expensive training and inference. Learning lightweight yet expressive deep model has emerged as an important and interesting direction. Inspired by the well-known split-transform-aggregate design heuristic in the Inception building block, this paper proposes a Skip-Layer Inception Module (SLIM) that facilitates efficient learning of image synthesis models, and a same-layer variant (dubbed as SLIM too) as a stronger alternative to the well-known ResNeXts for image recognition. In SLIM, the input feature map is first split into a number of groups (e.g., 4). Each group is then transformed to a latent style vector (via channel-wise attention) and a latent spatial mask (via spatial attention). The learned latent masks and latent style vectors are aggregated to modulate the target feature map. For generative learning, SLIM is built on a recently proposed lightweight Generative Adversarial Networks (i.e., FastGANs) which present a skip-layer excitation (SLE) module. For few-shot image synthesis tasks, the proposed SLIM achieves better performance than the SLE work and other related methods. For one-shot image synthesis tasks, it shows stronger capability of preserving images structures than prior arts such as the SinGANs. For image classification tasks, the proposed SLIM is used as a drop-in replacement for convolution layers in ResNets (resulting in ResNeXt-like models) and achieves better accuracy in the ImageNet-1000 dataset, with significantly smaller model complexity.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
2021
Li, Meihui; Peng, Lingbing; Wu, Tianfu; Peng, Zhenming
A Bottom-Up and Top-Down Integration Framework for Online Object Tracking Journal Article
In: IEEE Trans. Multim., vol. 23, pp. 105–119, 2021.
@article{DBLP:journals/tmm/LiPWP21,
title = {A Bottom-Up and Top-Down Integration Framework for Online Object Tracking},
author = {Meihui Li and Lingbing Peng and Tianfu Wu and Zhenming Peng},
url = {https://doi.org/10.1109/TMM.2020.2978623},
doi = {10.1109/TMM.2020.2978623},
year = {2021},
date = {2021-01-01},
journal = {IEEE Trans. Multim.},
volume = {23},
pages = {105\textendash119},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Roheda, Siddharth; Krim, Hamid; Luo, Zhi-Quan; Wu, Tianfu
Event driven sensor fusion Journal Article
In: Signal Process., vol. 188, pp. 108241, 2021.
@article{DBLP:journals/sigpro/RohedaKLW21,
title = {Event driven sensor fusion},
author = {Siddharth Roheda and Hamid Krim and Zhi-Quan Luo and Tianfu Wu},
url = {https://doi.org/10.1016/j.sigpro.2021.108241},
doi = {10.1016/J.SIGPRO.2021.108241},
year = {2021},
date = {2021-01-01},
journal = {Signal Process.},
volume = {188},
pages = {108241},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Sun, Wei; Wu, Tianfu
Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), 2021.
@article{LostGANs,
title = {Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/2003.11571},
doi = {10.1109/TPAMI.2021.3078577},
year = {2021},
date = {2021-05-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},
abstract = {With the remarkable recent progress on learning deep generative models, it becomes increasingly interesting to develop models for controllable image synthesis from reconfigurable inputs. This paper focuses on a recent emerged task, layout-to-image, to learn generative models that are capable of synthesizing photo-realistic images from spatial layout (i.e., object bounding boxes configured in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors). This paper first proposes an intuitive paradigm for the task, layout-to-mask-to-image, to learn to unfold object masks of given bounding boxes in an input layout to bridge the gap between the input layout and synthesized images. Then, this paper presents a method built on Generative Adversarial Networks for the proposed layout-to-mask-to-image with style control at both image and mask levels. Object masks are learned from the input layout and iteratively refined along stages in the generator network. Style control at the image level is the same as in vanilla GANs, while style control at the object mask level is realized by a proposed novel feature normalization scheme, Instance-Sensitive and Layout-Aware Normalization. In experiments, the proposed method is tested in the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Tan, Bin; Xue, Nan; Bai, Song; Wu, Tianfu; Xia, Gui-Song
PlaneTR: Structure-Guided Transformers for 3D Plane Recovery Proceedings Article
In: ICCV, 2021.
@inproceedings{planeTR,
title = {PlaneTR: Structure-Guided Transformers for 3D Plane Recovery},
author = {Bin Tan and Nan Xue and Song Bai and Tianfu Wu and Gui-Song Xia},
url = {https://arxiv.org/pdf/2107.13108.pdf},
year = {2021},
date = {2021-10-11},
booktitle = {ICCV},
abstract = {This paper presents a neural network built upon Transformers, namely PlaneTR, to simultaneously detect and reconstruct planes from a single image. Different from previous methods, PlaneTR jointly leverages the context information and the geometric structures in a sequence-to-sequence way to holistically detect plane instances in one forward pass. Specifically, we represent the geometric structures as line segments and conduct the network with three main components: (i) context and line segments encoders, (ii) a structure-guided plane decoder, (iii) a pixelwise plane embedding decoder. Given an image and its detected line segments, PlaneTR generates the context and line segment sequences via two specially designed encoders and then feeds them into a Transformers-based decoder to directly predict a sequence of plane instances by simultaneously considering the context and global structure cues. Finally, the pixel-wise embeddings are computed to assign each pixel to one predicted plane instance which is nearest to it in embedding space. Comprehensive experiments demonstrate that PlaneTR achieves a state-of-the-art performance on the ScanNet and NYUv2 datasets.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sun, Wei; Wu, Tianfu
Deep Consensus Learning Online
arXiv preprint 2021.
@online{DCL,
title = {Deep Consensus Learning},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/2103.08475},
year = {2021},
date = {2021-03-15},
organization = {arXiv preprint},
abstract = {Both generative learning and discriminative learning have recently witnessed remarkable progress using Deep Neural Networks (DNNs). For structured input synthesis and structured output prediction problems (e.g., layout-to-image synthesis and image semantic segmentation respectively), they often are studied separately. This paper proposes deep consensus learning (DCL) for joint layout-to-image synthesis and weakly-supervised image semantic segmentation. The former is realized by a recently proposed LostGAN approach, and the latter by introducing an inference network as the third player joining the two-player game of LostGAN. Two deep consensus mappings are exploited to facilitate training the three networks end-to-end: Given an input layout (a list of object bounding boxes), the generator generates a mask (label map) and then use it to help synthesize an image. The inference network infers the mask for the synthesized image. Then, the latent consensus is measured between the mask generated by the generator and the one inferred by the inference network. For the real image corresponding to the input layout, its mask also is computed by the inference network, and then used by the generator to reconstruct the real image. Then, the data consensus is measured between the real image and its reconstructed image. The discriminator still plays the role of an adversary by computing the realness scores for a real image, its reconstructed image and a synthesized image. In experiments, our DCL is tested in the COCO-Stuff dataset. It obtains compelling layout-to-image synthesis results and weakly-supervised image semantic segmentation results.},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
Xue, Nan; Wu, Tianfu; Zhang, Zhen; Xia, Gui-Song
Learning Local-Global Contextual Adaptation for Fully End-to-End Bottom-Up Human Pose Estimation Online
2021, visited: 08.09.2021.
@online{LOGOCAP,
title = {Learning Local-Global Contextual Adaptation for Fully End-to-End Bottom-Up Human Pose Estimation},
author = {Nan Xue and Tianfu Wu and Zhen Zhang and Gui-Song Xia},
url = {https://arxiv.org/abs/2109.03622},
year = {2021},
date = {2021-09-08},
urldate = {2021-09-08},
abstract = {This paper presents a method of learning Local-GlObal Contextual Adaptation for fully end-to-end and fast bottom-up human Pose estimation, dubbed as LOGO-CAP. It is built on the conceptually simple center-offset formulation that lacks inaccuracy for pose estimation. When revisiting the bottom-up human pose estimation with the thought of "thinking, fast and slow" by D. Kahneman, we introduce a "slow keypointer" to remedy the lack of sufficient accuracy of the "fast keypointer". In learning the "slow keypointer", the proposed LOGO-CAP lifts the initial "fast" keypoints by offset predictions to keypoint expansion maps (KEMs) to counter their uncertainty in two modules. Firstly, the local KEMs (e.g., 11x11) are extracted from a low-dimensional feature map. A proposed convolutional message passing module learns to "re-focus" the local KEMs to the keypoint attraction maps (KAMs) by accounting for the structured output prediction nature of human pose estimation, which is directly supervised by the object keypoint similarity (OKS) loss in training. Secondly, the global KEMs are extracted, with a sufficiently large region-of-interest (e.g., 97x97), from the keypoint heatmaps that are computed by a direct map-to-map regression. Then, a local-global contextual adaptation module is proposed to convolve the global KEMs using the learned KAMs as the kernels. This convolution can be understood as the learnable offsets guided deformable and dynamic convolution in a pose-sensitive way. The proposed method is end-to-end trainable with near real-time inference speed, obtaining state-of-the-art performance on the COCO keypoint benchmark for bottom-up human pose estimation. With the COCO trained model, our LOGO-CAP also outperforms prior arts by a large margin on the challenging OCHuman dataset.},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
Cheng, Kelvin; Healey, Christopher; Wu, Tianfu
Towards Adversarially Robust and Domain Generalizable Stereo Matching by Rethinking DNN Feature Backbones Online
arXiv preprint 2021.
@online{robustStereo,
title = {Towards Adversarially Robust and Domain Generalizable Stereo Matching by Rethinking DNN Feature Backbones},
author = {Kelvin Cheng and Christopher Healey and Tianfu Wu},
url = {https://arxiv.org/abs/2108.00335},
year = {2021},
date = {2021-08-03},
organization = {arXiv preprint},
abstract = {Stereo matching has recently witnessed remarkable progress using Deep Neural Networks (DNNs). But, how robust are they? Although it has been well-known that DNNs often suffer from adversarial vulnerability with a catastrophic drop in performance, the situation is even worse in stereo matching. This paper first shows that a type of weak white-box attacks can fail state-of-the-art methods. The attack is learned by a proposed stereo-constrained projected gradient descent (PGD) method in stereo matching. This observation raises serious concerns for the deployment of DNN-based stereo matching. Parallel to the adversarial vulnerability, DNN-based stereo matching is typically trained under the so-called simulation to reality pipeline, and thus domain generalizability is an important problem. This paper proposes to rethink the learnable DNN-based feature backbone towards adversarially-robust and domain generalizable stereo matching, either by completely removing it or by applying it only to the left reference image. It computes the matching cost volume using the classic multi-scale census transform (i.e., local binary pattern) of the raw input stereo images, followed by a stacked Hourglass head sub-network solving the matching problem. In experiments, the proposed method is tested in the SceneFlow dataset and the KITTI2015 benchmark. It significantly improves the adversarial robustness, while retaining accuracy performance comparable to state-of-the-art methods. It also shows better generalizability from simulation (SceneFlow) to real (KITTI) datasets when no fine-tuning is used.},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
2020
Xue, Nan; Bai, Song; Wang, Fu-Dong; Xia, Gui-Song; Wu, Tianfu; Zhang, Liangpei; Torr, Philip H. S.
Learning Regional Attraction for Line Segment Detection Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 2020, ISSN: 0162-8828.
@article{RegionalAttractionLSD,
title = {Learning Regional Attraction for Line Segment Detection},
author = {Nan Xue and Song Bai and Fu-Dong Wang and Gui-Song Xia and Tianfu Wu and Liangpei Zhang and Philip H.S. Torr},
url = {https://arxiv.org/abs/1912.09344},
doi = {10.1109/TPAMI.2019.2958642},
issn = {0162-8828},
year = {2020},
date = {2020-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
abstract = {This paper presents regional attraction of line segment maps, and hereby poses the problem of line segment detection (LSD) as a problem of region coloring. Given a line segment map, the proposed regional attraction first establishes the relationship between line segments and regions in the image lattice. Based on this, the line segment map is equivalently transformed to an attraction field map (AFM), which can be remapped to a set of line segments without loss of information. Accordingly, we develop an end-to-end framework to learn attraction field maps for raw input images, followed by a squeeze module to detect line segments. Apart from existing works, the proposed detector properly handles the local ambiguity and does not rely on the accurate identification of edge pixels. Comprehensive experiments on the Wireframe dataset and the YorkUrban dataset demonstrate the superiority of our method. In particular, we achieve an F-measure of 0.831 on the Wireframe dataset, advancing the state-of-the-art performance by 10.3 percent.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
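As a rough illustration of the attraction field map described in the abstract above (a toy NumPy sketch derived from the abstract's wording, not the authors' code; attraction_field and the toy segments are hypothetical), each pixel stores the displacement to its closest point on the nearest line segment.
# Toy sketch: turning a line-segment map into an attraction field map (AFM).
import numpy as np

def attraction_field(segments, height, width):
    """segments: list of ((x1, y1), (x2, y2)); returns an (H, W, 2) displacement field."""
    ys, xs = np.mgrid[0:height, 0:width]
    pixels = np.stack([xs, ys], axis=-1).astype(float)           # (H, W, 2)
    best_disp = np.zeros((height, width, 2))
    best_dist = np.full((height, width), np.inf)
    for (x1, y1), (x2, y2) in segments:
        p, d = np.array([x1, y1], float), np.array([x2 - x1, y2 - y1], float)
        t = np.clip(((pixels - p) @ d) / (d @ d), 0.0, 1.0)       # projection onto the segment
        closest = p + t[..., None] * d                            # closest point per pixel
        disp = closest - pixels
        dist = np.linalg.norm(disp, axis=-1)
        mask = dist < best_dist                                   # keep the nearest segment per pixel
        best_dist[mask] = dist[mask]
        best_disp[mask] = disp[mask]
    return best_disp

if __name__ == "__main__":
    afm = attraction_field([((2, 2), (12, 2)), ((4, 10), (4, 14))], 16, 16)
    print(afm.shape, afm[2, 7])    # (16, 16, 2); a pixel lying on the first segment maps to ~[0, 0]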
Li, Xilai; Sun, Wei; Wu, Tianfu
Attentive Normalization Proceedings Article
In: European Conference on Computer Vision (ECCV), 2020.
@inproceedings{AttnNorm,
title = {Attentive Normalization},
author = {Xilai Li and Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/1908.01259},
year = {2020},
date = {2020-08-23},
booktitle = {European Conference on Computer Vision (ECCV)},
journal = {CoRR},
volume = {abs/1905.10695},
abstract = {Batch Normalization (BN) is a vital pillar in the development of deep learning with many recent variations such as Group Normalization (GN) and Switchable Normalization. Channel-wise feature attention methods such as the squeeze-and-excitation (SE) unit have also shown impressive performance improvement. BN and its variants take into account different ways of computing the mean and variance within a min-batch for feature normalization, followed by a learnable channel-wise affine transformation. SE explicitly learns how to adaptively recalibrate channel-wise feature responses. They have been studied separately, however. In this paper, we propose a novel and lightweight integration of feature normalization and feature channel-wise attention. We present Attentive Normalization (AN) as a simple and unified alternative. AN absorbs SE into the affine transformation of BN. AN learns a small number of scale and offset parameters per channel (i.e., different affine transformations). Their weighted sums (i.e., mixture) are used in the final affine transformation. The weights are instance-specific and learned in a way that channel-wise attention is considered, similar in spirit to the squeeze module in the SE unit. AN is complementary and applicable to existing variants of BN. In experiments, we test AN in the ImageNet-1K classification dataset and the MS-COCO object detection and instance segmentation dataset with significantly better performance obtained than the vanilla BN. Our AN also outperforms two state-of-the-art variants of BN, GN and SN.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
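As a rough illustration of the mixture-of-affine-transformations idea described in the abstract above (a simplified PyTorch sketch under our own assumptions, not the authors' released Attentive Normalization code; AttentiveNorm2d and its parameters are hypothetical), normalization without its own affine is followed by a weighted sum of K learned per-channel affine transformations, with instance-specific weights produced by an SE-style squeeze module.
# Toy sketch: feature normalization followed by a mixture of K learned affines.
import torch
import torch.nn as nn

class AttentiveNorm2d(nn.Module):
    def __init__(self, channels, k=5):
        super().__init__()
        self.norm = nn.BatchNorm2d(channels, affine=False)   # normalization without its own affine
        self.gamma = nn.Parameter(torch.ones(k, channels))   # K candidate scales per channel
        self.beta = nn.Parameter(torch.zeros(k, channels))   # K candidate offsets per channel
        self.squeeze = nn.Sequential(                        # instance-specific mixture weights
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(channels, k), nn.Softmax(dim=1))

    def forward(self, x):                                    # x: (B, C, H, W)
        xn = self.norm(x)
        w = self.squeeze(x)                                  # (B, K)
        gamma = w @ self.gamma                               # (B, C) mixed scale
        beta = w @ self.beta                                 # (B, C) mixed offset
        return gamma[:, :, None, None] * xn + beta[:, :, None, None]

if __name__ == "__main__":
    print(AttentiveNorm2d(64)(torch.randn(8, 64, 32, 32)).shape)  # torch.Size([8, 64, 32, 32])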
Xue, Nan; Wu, Tianfu; Bai, Song; Wang, Fudong; Xia, Gui-Song; Zhang, Liangpei; Torr, Philip H. S.
Holistically-Attracted Wireframe Parsing Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.
@inproceedings{HAWP,
title = {Holistically-Attracted Wireframe Parsing},
author = {Nan Xue and Tianfu Wu and Song Bai and Fudong Wang and Gui-Song Xia and Liangpei Zhang and Philip H.S. Torr},
year = {2020},
date = {2020-02-23},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.},
abstract = {This paper presents a fast and parsimonious parsing method to accurately and robustly detect a vectorized wireframe in an input image with a single forward pass. The proposed method is end-to-end trainable, consisting of three components: (i) line segment and junction proposal generation, (ii) line segment and junction matching, and (iii) line segment and junction verification.
For computing line segment proposals, a novel exact dual representation is proposed which exploits a parsimonious geometric reparameterization for line segments and forms a holistic 4-dimensional attraction field map for an input image. Junctions can be treated as the ``basins" in the attraction field. The proposed method is thus called Holistically-Attracted Wireframe Parser (HAWP). In experiments, the proposed method is tested on two benchmarks, the Wireframe dataset and the YorkUrban dataset. On both benchmarks, it obtains state-of-the-art performance in terms of accuracy and efficiency. For example, on the Wireframe dataset, compared to the previous state-of-the-art method L-CNN, it improves the challenging mean structural average precision (msAP) by a large margin ($2.8%$ absolute improvements), and achieves 29.5 FPS on a single GPU (89% relative improvement). A systematic ablation study is performed to further justify the proposed method. },
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Zexi; Dutton, Benjamin; Ramachandra, Bharathkumar; Wu, Tianfu; Vatsavai, Ranga Raju
Local Clustering with Mean Teacher for Semi-supervised learning Proceedings Article
In: 25th International Conference on Pattern Recognition, ICPR 2020, Virtual Event / Milan, Italy, January 10-15, 2021, pp. 6243–6250, 2020.
@inproceedings{DBLP:conf/icpr/ChenDRWV20,
title = {Local Clustering with Mean Teacher for Semi-supervised learning},
author = {Zexi Chen and Benjamin Dutton and Bharathkumar Ramachandra and Tianfu Wu and Ranga Raju Vatsavai},
url = {https://doi.org/10.1109/ICPR48806.2021.9412469},
doi = {10.1109/ICPR48806.2021.9412469},
year = {2020},
date = {2020-01-01},
booktitle = {25th International Conference on Pattern Recognition, ICPR 2020,
Virtual Event / Milan, Italy, January 10-15, 2021},
pages = {6243\textendash6250},
crossref = {DBLP:conf/icpr/2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xing, Xianglei; Wu, Tianfu; Zhu, Song-Chun; Wu, Ying Nian
Towards Interpretable Image Synthesis by Learning Sparsely Connected AND-OR Networks Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.
@inproceedings{iGenerativeM,
title = {Towards Interpretable Image Synthesis by Learning Sparsely Connected AND-OR Networks},
author = {Xianglei Xing and Tianfu Wu and Song-Chun Zhu and Ying Nian Wu},
url = {https://arxiv.org/abs/1909.04324},
year = {2020},
date = {2020-02-23},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.},
journal = {CoRR},
abstract = {This paper proposes interpretable image synthesis by learning hierarchical AND-OR networks of sparsely connected semantically meaningful nodes. The proposed method is based on the compositionality and interpretability of scene-objects-parts-subparts-primitives hierarchy in image representation. A scene has different types (i.e., OR) each of which consists of a number of objects (i.e., AND). This can be recursively formulated across the scene-objects-parts-subparts hierarchy and is terminated at the primitive level (e.g., Gabor wavelets-like basis). To realize this interpretable AND-OR hierarchy in image synthesis, the proposed method consists of two components: (i) Each layer of the hierarchy is represented by an over-completed set of basis functions. The basis functions are instantiated using convolution to be translation covariant. Off-the-shelf convolutional neural architectures are then exploited to implement the hierarchy. (ii) Sparsity-inducing constraints are introduced in end-to-end training, which facilitate a sparsely connected AND-OR network to emerge from initially densely connected convolutional neural networks. A straightforward sparsity-inducing constraint is utilized, that is to only allow the top-k basis functions to be active at each layer (where k is a hyperparameter). The learned basis functions are also capable of image reconstruction to explain away input images. In experiments, the proposed method is tested on five benchmark datasets. The results show that meaningful and interpretable hierarchical representations are learned with better qualities of image synthesis and reconstruction obtained than state-of-the-art baselines.},
howpublished = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sun, Wei; Wu, Tianfu
Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis Online
2020.
@online{LostGANv2,
title = {Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/2003.11571},
year = {2020},
date = {2020-04-03},
abstract = {With the remarkable recent progress on learning deep generative models, it becomes increasingly interesting to develop models for controllable image synthesis from reconfigurable inputs. This paper focuses on a recent emerged task, layout-to-image, to learn generative models that are capable of synthesizing photo-realistic images from spatial layout (i.e., object bounding boxes configured in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors). This paper first proposes an intuitive paradigm for the task, layout-to-mask-to-image, to learn to unfold object masks of given bounding boxes in an input layout to bridge the gap between the input layout and synthesized images. Then, this paper presents a method built on Generative Adversarial Networks for the proposed layout-to-mask-to-image with style control at both image and mask levels. Object masks are learned from the input layout and iteratively refined along stages in the generator network. Style control at the image level is the same as in vanilla GANs, while style control at the object mask level is realized by a proposed novel feature normalization scheme, Instance-Sensitive and Layout-Aware Normalization. In experiments, the proposed method is tested in the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.},
keywords = {},
pubstate = {published},
tppubtype = {online}
}
2019
Xie, Zhao; Wu, Tianfu; Yang, Xingming; Zhang, Luming; Wu, Kewei
Jointly social grouping and identification in visual dynamics with causality-induced hierarchical Bayesian model Journal Article
In: J. Visual Communication and Image Representation, vol. 59, pp. 62–75, 2019.
@article{SocialGroupingId,
title = {Jointly social grouping and identification in visual dynamics with
causality-induced hierarchical Bayesian model},
author = {Zhao Xie and Tianfu Wu and Xingming Yang and Luming Zhang and Kewei Wu},
url = {https://www.sciencedirect.com/science/article/pii/S1047320319300057},
doi = {10.1016/j.jvcir.2019.01.006},
year = {2019},
date = {2019-02-01},
journal = {J. Visual Communication and Image Representation},
volume = {59},
pages = {62--75},
abstract = {We concentrate on modeling person-person interactions for group activity recognition. In order to solve the complexity and ambiguity problems caused by a large number of human objects, we propose a causality-induced hierarchical Bayesian model to tackle interaction activity videos, addressing “what” interaction activities happen, “where” the atomic interactions occur spatially, and “when” the group interactions happen temporally. In particular, Granger Causality is characterized with multiple features to encode the interacting relationships between the individuals in the group. Furthermore, to detect and identify concurrent interactions simultaneously, we investigate Relative Entropy as a metric to measure the motion dependency between two arbitrary individuals. Filtered by the causality dependency, causal motion features are cast as multiplicative probabilistic ingredients in Bayesian factors to learn compact latent interaction patterns that enable discrimination. Experiments demonstrate that our model outperforms state-of-the-art models.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Asadi, Khashayar; Chen, Pengyu; Han, Kevin K; Wu, Tianfu; Lobaton, Edgar J
LNSNet: Lightweight Navigable Space Segmentation for Autonomous Robots on Construction Sites Journal Article
In: Data, vol. 4, no. 1, pp. 40, 2019.
@article{LNSNet,
title = {LNSNet: Lightweight Navigable Space Segmentation for Autonomous Robots
on Construction Sites},
author = {Khashayar Asadi and Pengyu Chen and Kevin K Han and Tianfu Wu and Edgar J Lobaton},
url = {https://www.mdpi.com/2306-5729/4/1/40},
doi = {10.3390/data4010040},
year = {2019},
date = {2019-03-13},
journal = {Data},
volume = {4},
number = {1},
pages = {40},
abstract = {An autonomous robot that can monitor a construction site should be able to contextually detect its surrounding environment by recognizing objects and making decisions based on its observations. Pixel-wise semantic segmentation in real-time is vital to building an autonomous and mobile robot. However, the learning models’ size and high memory usage associated with real-time segmentation are the main challenges for mobile robotics systems that have limited computing resources. To overcome these challenges, this paper presents an efficient semantic segmentation method named LNSNet (lightweight navigable space segmentation network) that can run on embedded platforms to determine navigable space in real-time. The core of the model architecture is a new block based on separable convolution which compresses the parameters of the present residual block while maintaining accuracy and performance. LNSNet is faster, has fewer parameters and a smaller model size, and provides similar accuracy compared to existing models. A new pixel-level annotated dataset for real-time and mobile navigable space segmentation in construction environments has been constructed for the proposed method. The results demonstrate the effectiveness and efficiency that are necessary for the future development of autonomous robotics systems.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Li, Xilai; Song, Xi; Wu, Tianfu
AOGNets: Compositional Grammatical Architectures for Deep Learning Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.
@inproceedings{AOGNets,
title = {AOGNets: Compositional Grammatical Architectures for Deep Learning},
author = {Xilai Li and Xi Song and Tianfu Wu},
url = {http://openaccess.thecvf.com/content_CVPR_2019/papers/Li_AOGNets_Compositional_Grammatical_Architectures_for_Deep_Learning_CVPR_2019_paper.pdf
https://github.com/iVMCL/AOGNets
https://www.wraltechwire.com/2019/05/21/ncsu-researchers-create-framework-for-a-smarter-ai-are-seeking-patent/
https://www.technologynetworks.com/tn/news/new-framework-enhances-neural-network-performance-319704
},
year = {2019},
date = {2019-06-18},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {Neural architectures are the foundation for improving performance of deep neural networks (DNNs). This paper presents deep compositional grammatical architectures which harness the best of two worlds: grammar models and DNNs. The proposed architectures integrate compositionality and reconfigurability of the former and the capability of learning rich features of the latter in a principled way. We utilize AND-OR Grammar (AOG) as the network generator in this paper and call the resulting networks AOGNets. An AOGNet consists of a number of stages each of which is composed of a number of AOG building blocks. An AOG building block splits its input feature map into N groups along feature channels and then treats it as a sentence of N words. It then jointly realizes a phrase structure grammar and a dependency grammar in parsing the “sentence” bottom-up for better feature exploration and reuse. It provides a unified framework for the best practices developed in state-of-the-art DNNs. In experiments, AOGNet is tested on the ImageNet-1K classification benchmark and the MS-COCO object detection and segmentation benchmark. In ImageNet-1K, AOGNet obtains better performance than ResNet and most of its variants, ResNeXt and its attention-based variants such as SENet, DenseNet and DualPathNet. AOGNet also obtains the best model interpretability score using network dissection. AOGNet further shows better potential in adversarial defense. In MS-COCO, AOGNet obtains better performance than the ResNet and ResNeXt backbones in Mask R-CNN.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
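For readers unfamiliar with the grammar terminology in the abstract above, the toy PyTorch sketch below illustrates the "sentence of N words" view of an AOG building block: the input feature map is chunked into N channel groups, terminal nodes transform individual word spans, an AND node composes children by concatenation, and an OR node mixes alternatives by summation. This is an illustrative reading, not the released AOGNets code linked above; the node wiring and channel sizes are assumptions.

import torch
import torch.nn as nn

def split_into_words(x: torch.Tensor, n_words: int):
    """View a (N, C, H, W) feature map as a sentence of n_words words by
    chunking it into n_words channel groups (C divisible by n_words)."""
    return torch.chunk(x, n_words, dim=1)

class TerminalNode(nn.Module):
    """Terminal node: an ordinary conv-BN-ReLU unit on one word span."""
    def __init__(self, c_in: int, c_out: int):
        super().__init__()
        self.op = nn.Sequential(
            nn.Conv2d(c_in, c_out, 3, padding=1, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.op(x)

def and_node(children):
    # AND node: compose child features by concatenation along channels.
    return torch.cat(list(children), dim=1)

def or_node(alternatives):
    # OR node: mix alternative decompositions by element-wise summation
    # (all alternatives must share the same shape).
    return torch.stack(list(alternatives), dim=0).sum(dim=0)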
Sun, Wei; Wu, Tianfu
Image Synthesis from Reconfigurable Layout and Style Proceedings Article
In: International Conference on Computer Vision (ICCV), 2019.
@inproceedings{LostGAN,
title = {Image Synthesis from Reconfigurable Layout and Style},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/1908.07500
https://github.com/iVMCL/LostGANs},
year = {2019},
date = {2019-10-28},
booktitle = {International Conference on Computer Vision (ICCV)},
abstract = {Despite remarkable recent progress on both unconditional and conditional image synthesis, it remains a long-standing problem to learn generative models that are capable of synthesizing realistic and sharp images from reconfigurable spatial layout (i.e., bounding boxes + class labels in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors), especially at high resolution. By reconfigurable, we mean that a model can preserve the intrinsic one-to-many mapping from a given layout to multiple plausible images with different styles, and is adaptive with respect to perturbations of a layout and style latent code. In this paper, we present a layout- and style-based architecture for generative adversarial networks (termed LostGANs) that can be trained end-to-end to generate images from reconfigurable layout and style. Inspired by the vanilla StyleGAN, the proposed LostGAN consists of two new components: (i) learning fine-grained mask maps in a weakly-supervised manner to bridge the gap between layouts and images, and (ii) learning object instance-specific layout-aware feature normalization (ISLA-Norm) in the generator to realize multi-object style generation. In experiments, the proposed method is tested on the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
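The ISLA-Norm idea named in the abstract above (and in the closely related entry "Learning Layout and Style Reconfigurable GANs" earlier in this list) can be summarized as: normalize features as usual, then modulate them with per-object affine parameters predicted from object style vectors and spread over the image lattice by soft object masks. The PyTorch sketch below is an illustrative reading of that description, not the authors' released code (linked above); the module name, tensor shapes and the use of BatchNorm statistics are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ISLANormSketch(nn.Module):
    """Instance-sensitive, layout-aware modulation of normalized features
    (an illustrative sketch of the idea, not the official ISLA-Norm)."""
    def __init__(self, num_features: int, style_dim: int):
        super().__init__()
        self.norm = nn.BatchNorm2d(num_features, affine=False)
        self.to_gamma = nn.Linear(style_dim, num_features)
        self.to_beta = nn.Linear(style_dim, num_features)

    def forward(self, x, styles, masks):
        # x: (N, C, H, W) features; styles: (N, O, style_dim) per-object
        # style vectors; masks: (N, O, h, w) soft per-object masks.
        n, c, h, w = x.shape
        x = self.norm(x)
        gamma = self.to_gamma(styles)                    # (N, O, C)
        beta = self.to_beta(styles)                      # (N, O, C)
        masks = F.interpolate(masks, size=(h, w), mode="bilinear",
                              align_corners=False)
        # Spread each object's affine parameters over its mask support.
        gamma_map = torch.einsum("nohw,noc->nchw", masks, gamma)
        beta_map = torch.einsum("nohw,noc->nchw", masks, beta)
        return x * (1.0 + gamma_map) + beta_map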
Li, Xilai; Zhou, Yingbo; Wu, Tianfu; Socher, Richard; Xiong, Caiming
Learn to Grow: A Continual Structure Learning Framework for Overcoming Catastrophic Forgetting Proceedings Article
In: International Conference on Machine Learning (ICML), 2019.
@inproceedings{Learn2grow,
title = {Learn to Grow: A Continual Structure Learning Framework for Overcoming Catastrophic Forgetting},
author = {Xilai Li and Yingbo Zhou and Tianfu Wu and Richard Socher and Caiming Xiong},
url = {https://arxiv.org/abs/1904.00310
https://news.ncsu.edu/2019/05/ai-continual-learning-framework/
https://www.army.mil/article/222090/army_funded_research_boosts_memory_of_ai_systems
https://news.science360.gov/archives/20190517
https://techxplore.com/news/2019-05-framework-artificial-intelligence.html
https://www.wraltechwire.com/2019/05/15/researchers-create-framework-to-help-artificial-intelligence-systems-be-less-forgetful/},
year = {2019},
date = {2019-06-11},
booktitle = {International Conference on Machine Learning (ICML)},
abstract = {Addressing catastrophic forgetting is one of the key challenges in continual learning where machine learning systems are trained with sequential or streaming tasks. Despite recent remarkable progress in state-of-the-art deep learning, deep neural networks (DNNs) are still plagued with the catastrophic forgetting problem. This paper presents a conceptually simple yet general and effective framework for handling catastrophic forgetting in continual learning with DNNs. The proposed method consists of two components: a neural structure optimization component and a parameter learning and/or fine-tuning component. By separating the explicit neural structure learning and the parameter estimation, not only is the proposed method capable of evolving neural structures in an intuitively meaningful way, but it also shows strong capabilities of alleviating catastrophic forgetting in experiments. Furthermore, the proposed method outperforms all other baselines on the permuted MNIST dataset, the split CIFAR100 dataset and the Visual Domain Decathlon dataset in the continual learning setting.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
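The separation of structure learning from parameter learning described in the abstract above boils down to a per-layer choice for each new task: reuse an existing layer, adapt a copy of it, or add a new one. The sketch below only enumerates these candidates under stated assumptions (the helper name and the make_new factory are invented for illustration); the paper selects among the candidates with a differentiable search, which is not reproduced here.

import copy
import torch.nn as nn

def candidate_ops(layer: nn.Module, make_new) -> nn.ModuleDict:
    """Build the three structural candidates for one layer when a new
    task arrives: 'reuse' shares the old layer with frozen weights,
    'adapt' fine-tunes a copy, and 'new' is a freshly initialized layer
    produced by the caller-supplied make_new() factory."""
    reuse = layer
    for p in reuse.parameters():
        p.requires_grad_(False)            # shared and frozen when reused
    adapt = copy.deepcopy(layer)           # trainable task-specific copy
    for p in adapt.parameters():
        p.requires_grad_(True)
    return nn.ModuleDict({"reuse": reuse, "adapt": adapt, "new": make_new()})

# Example (hypothetical layer and factory):
# cands = candidate_ops(nn.Conv2d(64, 64, 3, padding=1),
#                       make_new=lambda: nn.Conv2d(64, 64, 3, padding=1))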
Xue, Nan; Bai, Song; Wang, Fudong; Xia, Gui-Song; Wu, Tianfu; Zhang, Liangpei
Learning Attraction Field Representation for Robust Line Segment Detection Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.
@inproceedings{AFM_LSD,
title = {Learning Attraction Field Representation for Robust Line Segment Detection},
author = {Nan Xue and Song Bai and Fudong Wang and Gui-Song Xia and Tianfu Wu and Liangpei Zhang},
url = {https://arxiv.org/abs/1812.02122
https://github.com/cherubicXN/afm_cvpr2019},
year = {2019},
date = {2019-06-18},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {This paper presents a region-partition based attraction field dual representation for line segment maps, and thus poses the problem of line segment detection (LSD) as the region coloring problem. The latter is then addressed by learning deep convolutional neural networks (ConvNets) for accuracy, robustness and efficiency. For a 2D line segment map, our dual representation consists of three components: (i) A region-partition map in which every pixel is assigned to one and only one line segment; (ii) An attraction field map in which every pixel in a partition region is encoded by its 2D projection vector w.r.t. the associated line segment; and (iii) A squeeze module which squashes the attraction field to a line segment map that almost perfectly recovers the input one. By leveraging the duality, we learn ConvNets to compute the attraction field maps for raw input images, followed by the squeeze module for LSD, in an end-to-end manner. Our method rigorously addresses several challenges in LSD such as local ambiguity and class imbalance. Our method also harnesses the best practices developed in ConvNets based semantic segmentation methods such as the encoder-decoder architecture and the a-trous convolution. In experiments, our method is tested on the WireFrame dataset and the YorkUrban dataset with state-of-the-art performance obtained. Especially, we advance the performance by 4.5 percent on the WireFrame dataset. Our method is also fast with 6.6~10.4 FPS, outperforming most existing line segment detectors.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
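The dual representation in the abstract above is geometric and can be sketched directly: every pixel is assigned to its nearest line segment (the region partition) and stores the 2D vector to its projection onto that segment (the attraction field). The NumPy sketch below is a brute-force illustration under stated assumptions (dense pixel-to-segment distances, non-degenerate segments); it is not the training-ready implementation released by the authors.

import numpy as np

def attraction_field(h: int, w: int, segments: np.ndarray):
    """Compute a (h, w, 2) attraction field and a (h, w) region-partition
    map for line segments given as an (M, 4) array of (x1, y1, x2, y2)
    endpoints (segments are assumed non-degenerate)."""
    ys, xs = np.mgrid[0:h, 0:w]
    pts = np.stack([xs, ys], axis=-1).reshape(-1, 2).astype(np.float64)  # (HW, 2)
    p1 = segments[:, :2].astype(np.float64)
    p2 = segments[:, 2:].astype(np.float64)
    d = p2 - p1                                                          # (M, 2)
    t = ((pts[:, None, :] - p1[None]) * d[None]).sum(-1) / (d * d).sum(-1)[None]
    t = np.clip(t, 0.0, 1.0)                       # clamp projections to the segment
    proj = p1[None] + t[..., None] * d[None]       # (HW, M, 2) closest points
    vec = proj - pts[:, None, :]                   # pixel -> projection vectors
    dist = np.linalg.norm(vec, axis=-1)            # (HW, M)
    nearest = dist.argmin(axis=1)                  # region partition: one segment per pixel
    field = vec[np.arange(len(pts)), nearest].reshape(h, w, 2)
    return field, nearest.reshape(h, w)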
Wu, Tianfu; Song, Xi
Towards Interpretable Object Detection by Unfolding Latent Structures Proceedings Article
In: International Conference on Computer Vision (ICCV), 2019.
@inproceedings{iRCNN,
title = {Towards Interpretable Object Detection by Unfolding Latent Structures},
author = {Tianfu Wu and Xi Song},
year = {2019},
date = {2019-10-28},
booktitle = {International Conference on Computer Vision (ICCV)},
abstract = {This paper first proposes a method of formulating model interpretability in visual understanding tasks based on the idea of unfolding latent structures. It then presents a case study in object detection using popular two-stage region- based convolutional network (i.e., R-CNN) detection systems. The proposed method focuses on weakly-supervised extractive rationale generation, that is learning to unfold latent discriminative part configurations of object instances automatically and simultaneously in de- tection without using any supervision for part configura- tions. It utilizes a top-down hierarchical and compositional grammar model embedded in a directed acyclic AND-OR Graph (AOG) to explore and unfold the space of latent part configurations of regions of interest (RoIs). It presents an AOGParsing operator that seamlessly integrates with the RoIPooling/RoIAlign operator widely used in R-CNN and is trained end-to-end. In object detection, a bounding box is interpreted by the best parse tree derived from the AOG on-the-fly, which is treated as the qualita- tively extractive rationale generated for interpreting detec- tion. In experiments, Faster R-CNN [50] is used to test the proposed method on the PASCAL VOC 2007 and the COCO 2017 object detection datasets. The experimental results show that the proposed method can com- pute promising latent structures without hurting the performance.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sun, Wei; Wu, Tianfu
Learning Spatial Pyramid Attentive Pooling in Image Synthesis and Image-to-Image Translation Miscellaneous
arXiv preprint, 2019.
@misc{SPAP,
title = {Learning Spatial Pyramid Attentive Pooling in Image Synthesis and Image-to-Image Translation},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/1901.06322},
year = {2019},
date = {2019-01-01},
journal = {CoRR},
volume = {abs/1901.06322},
abstract = {Image synthesis and image-to-image translation are two important generative learning tasks. Remarkable progress has been made by learning Generative Adversarial Networks (GANs) and cycle-consistent GANs (CycleGANs) respectively. This paper presents a method of learning Spatial Pyramid Attentive Pooling (SPAP) which is a novel architectural unit and can be easily integrated into both generators and discriminators in GANs and CycleGANs. The proposed SPAP integrates Atrous spatial pyramid, a proposed cascade attention mechanism and residual connections. It leverages the advantages of the three components to facilitate effective end-to-end generative learning: (i) the capability of fusing multi-scale information by ASPP; (ii) the capability of capturing relative importance between both spatial locations (especially multi-scale context) or feature channels by attention; (iii) the capability of preserving information and enhancing optimization feasibility by residual connections. Coarse-to-fine and fine-to-coarse SPAP are studied and intriguing attention maps are observed in both tasks. In experiments, the proposed SPAP is tested in GANs on the Celeba-HQ-128 dataset, and tested in CycleGANs on the Image-to-Image translation datasets including the Cityscape dataset, Facade and Aerial Maps dataset, both obtaining better performance.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2018
Roheda, Siddharth; Krim, Hamid; Luo, Zhi-Quan; Wu, Tianfu
Decision Level Fusion: An Event Driven Approach Proceedings Article
In: 26th European Signal Processing Conference, EUSIPCO 2018, Roma, Italy, September 3-7, 2018, pp. 2598–2602, 2018.
@inproceedings{DBLP:conf/eusipco/RohedaKLW18,
title = {Decision Level Fusion: An Event Driven Approach},
author = {Siddharth Roheda and Hamid Krim and Zhi-Quan Luo and Tianfu Wu},
url = {https://doi.org/10.23919/EUSIPCO.2018.8553412},
doi = {10.23919/EUSIPCO.2018.8553412},
year = {2018},
date = {2018-01-01},
booktitle = {26th European Signal Processing Conference, EUSIPCO 2018, Roma,
Italy, September 3-7, 2018},
pages = {2598--2602},
crossref = {DBLP:conf/eusipco/2018},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Qi, Hang; Xu, Yuanlu; Yuan, Tao; Wu, Tianfu; Zhu, Song-Chun
Joint Parsing of Cross-view Scenes with Spatio-temporal Semantic Parse Graphs Proceedings Article
In: Proceedings of The Thirty-Second AAAI Conference on Artificial Intelligence (AAAI), New Orleans, Louisiana, USA, February 2–7, pp. 1–4, 2018.
@inproceedings{JointParsing,
title = {Joint Parsing of Cross-view Scenes with Spatio-temporal Semantic Parse Graphs},
author = {Hang Qi and Yuanlu Xu and Tao Yuan and Tianfu Wu and Song-Chun Zhu},
url = {https://arxiv.org/pdf/1709.05436.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of The Thirty-Second AAAI Conference on Artificial Intelligence (AAAI), New Orleans, Louisiana, USA, February 2--7},
pages = {1--4},
abstract = {Cross-view video understanding is an important yet underexplored area in computer vision. In this paper, we introduce a joint parsing method that takes view-centric proposals from pre-trained computer vision models and produces spatiotemporal parse graphs that represent a coherent scene-centric understanding of cross-view scenes. Our key observations are that overlapping fields of view embed rich appearance and geometry correlations and that knowledge segments corresponding to individual vision tasks are governed by consistency constraints available in commonsense knowledge. The proposed joint parsing framework models such correlations and constraints explicitly and generates semantic parse graphs about the scene. Quantitative experiments show that scene-centric predictions in the parse graph outperform view-centric predictions.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Li, Bo; Xiong, Caiming; Wu, Tianfu; Zhou, Yu; Zhang, Lun; Chu, Rufeng
Neural Abstract Style Transfer for Chinese Traditional Painting Proceedings Article
In: Asian Conference on Computer Vision (ACCV), 2018.
@inproceedings{ChinesePainting,
title = {Neural Abstract Style Transfer for Chinese Traditional Painting},
author = {Bo Li and Caiming Xiong and Tianfu Wu and Yu Zhou and Lun Zhang and Rufeng Chu},
url = {https://arxiv.org/abs/1812.03264},
year = {2018},
date = {2018-01-01},
booktitle = {Asian Conference on Computer Vision (ACCV)},
abstract = {Chinese traditional painting is one of the most historical artworks in the world. It is very popular in Eastern and Southeast Asia due to its aesthetic appeal. Compared with western artistic painting, it is usually more visually abstract and textureless. Recently, neural network based style transfer methods have shown promising and appealing results which are mainly focused on western painting. It remains a challenging problem to preserve abstraction in neural style transfer. In this paper, we present a Neural Abstract Style Transfer method for Chinese traditional painting. It learns to preserve abstraction and other style elements jointly end-to-end via a novel MXDoG-guided filter (Modified version of the eXtended Difference-of-Gaussians) and three fully differentiable loss terms. To the best of our knowledge, there is little work studying neural style transfer for Chinese traditional painting. To promote research in this direction, we collect a new dataset with diverse photo-realistic images and Chinese traditional paintings. In experiments, the proposed method shows more appealing stylized results in transferring the style of Chinese traditional painting than state-of-the-art neural style transfer methods.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
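As background for the MXDoG guide mentioned in the abstract above: it builds on the extended difference-of-Gaussians family of edge-preserving filters. The sketch below shows only the plain, sharpened DoG response D = G_sigma(I) - tau * G_{k*sigma}(I); the paper's modified formulation and its three differentiable loss terms are not reproduced, and the parameter defaults are assumptions.

import numpy as np
from scipy.ndimage import gaussian_filter

def sharpened_dog(gray: np.ndarray, sigma: float = 1.0,
                  k: float = 1.6, tau: float = 0.98) -> np.ndarray:
    """Plain sharpened difference-of-Gaussians response on a 2D grayscale
    image in [0, 1]; a rough illustration of the filter family behind
    the MXDoG guide, not the paper's modified version."""
    fine = gaussian_filter(gray, sigma)        # small-scale blur
    coarse = gaussian_filter(gray, k * sigma)  # larger-scale blur
    return fine - tau * coarse                 # edge-like response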
Li, Bo; Wu, Tianfu; Zhang, Lun; Chu, Rufeng
Auto-Context RCNN Miscellaneous
arXiv preprint, 2018.
@misc{AutoCtxRCNN,
title = {Auto-Context RCNN},
author = {Bo Li and Tianfu Wu and Lun Zhang and Rufeng Chu},
url = {https://arxiv.org/abs/1807.02842},
year = {2018},
date = {2018-01-01},
journal = {CoRR},
volume = {abs/1807.02842},
abstract = {Region-based convolutional neural networks (R-CNN) have largely dominated object detection. Operators defined on RoIs (Regions of Interest) play an important role in R-CNNs, such as RoIPooling and RoIAlign. They all only utilize information inside RoIs for RoI prediction, even with their recent deformable extensions. Although surrounding context is well-known for its importance in object detection, it has not yet been integrated into R-CNNs in a flexible and effective way. Inspired by the auto-context work and the multi-class object layout work, this paper presents a generic context-mining RoI operator (i.e., RoICtxMining) seamlessly integrated in R-CNNs, and the resulting object detection system is termed Auto-Context R-CNN which is trained end-to-end. The proposed RoICtxMining operator is a simple yet effective two-layer extension of the RoIPooling or RoIAlign operator. Centered at an object-RoI, it creates a 3×3 layout to mine contextual information adaptively in the 8 surrounding context regions on-the-fly. Within each of the 8 context regions, a context-RoI is mined in terms of discriminative power and its RoIPooling / RoIAlign features are concatenated with the object-RoI for final prediction. The proposed Auto-Context R-CNN is robust to occlusion and small objects, and shows promising vulnerability to adversarial attacks without being adversarially-trained. In experiments, it is evaluated using RoIPooling as the backbone and shows competitive results on the Pascal VOC, Microsoft COCO, and KITTI datasets (including 6.9% mAP improvements over the R-FCN method on the COCO test-dev dataset and the first place on both KITTI pedestrian and cyclist detection as of this submission).},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
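The 3×3 context layout described in the abstract above is simple to write down: around an object RoI, the 8 neighboring cells of the same size are the regions within which context-RoIs are mined. The sketch below only generates those 8 candidate boxes (without clipping to the image or the discriminative mining step); the function name is an assumption.

def context_regions(roi):
    """Given an object RoI (x1, y1, x2, y2), return the 8 surrounding
    boxes of the same size (left/right/top/bottom and the 4 diagonals)
    that form the 3x3 layout around it."""
    x1, y1, x2, y2 = roi
    w, h = x2 - x1, y2 - y1
    regions = []
    for dy in (-1, 0, 1):
        for dx in (-1, 0, 1):
            if dx == 0 and dy == 0:
                continue                       # the center cell is the object itself
            regions.append((x1 + dx * w, y1 + dy * h,
                            x2 + dx * w, y2 + dy * h))
    return regions

# Example: context_regions((10, 20, 50, 80)) -> 8 neighboring boxes.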
Chen, Zeyuan; Nie, Shaoliang; Wu, Tianfu; Healey, Christopher G
High Resolution Face Completion with Multiple Controllable Attributes via Fully End-to-End Progressive Generative Adversarial Networks Miscellaneous
arXiv preprint, 2018.
@misc{FaceCompletion,
title = {High Resolution Face Completion with Multiple Controllable Attributes via Fully End-to-End Progressive Generative Adversarial Networks},
author = {Zeyuan Chen and Shaoliang Nie and Tianfu Wu and Christopher G Healey},
url = {https://arxiv.org/abs/1801.07632},
year = {2018},
date = {2018-01-01},
journal = {CoRR},
volume = {abs/1801.07632},
abstract = {We present a deep learning approach for high resolution face completion with multiple controllable attributes (e.g., male and smiling) under arbitrary masks. Face completion entails understanding both structural meaningfulness and appearance consistency locally and globally to fill in “holes” whose content does not appear elsewhere in an input image. It is a challenging task with the difficulty level increasing significantly with respect to high resolution, the complexity of “holes” and the controllable attributes of filled-in fragments. Our system addresses the challenges by learning a fully end-to-end framework that trains generative adversarial networks (GANs) progressively from low resolution to high resolution with conditional vectors encoding controllable attributes. We design novel network architectures to exploit information across multiple scales effectively and efficiently. We introduce new loss functions encouraging sharp completion. We show that our system can complete faces with large structural and appearance variations using a single feed-forward pass of computation with mean inference time of 0.007 seconds for images at 1024 × 1024 resolution. We also perform a pilot human study that shows our approach outperforms state-of-the-art face completion methods in terms of rank analysis.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Chen, Zexi; Ramachandra, Bharathkumar; Wu, Tianfu; Vatsavai, Ranga Raju
Relational Long Short-Term Memory for Video Action Recognition Miscellaneous
arXiv preprint, 2018.
@misc{RelationalLSTM,
title = {Relational Long Short-Term Memory for Video Action Recognition},
author = {Zexi Chen and Bharathkumar Ramachandra and Tianfu Wu and Ranga Raju Vatsavai},
url = {https://arxiv.org/abs/1811.07059},
year = {2018},
date = {2018-01-01},
journal = {CoRR},
volume = {abs/1811.07059},
abstract = {Spatial and temporal relationships, both short-range and long-range, between objects in videos are key cues for recognizing actions. It is a challenging problem to model them jointly. In this paper, we first present a new variant of Long Short-Term Memory, namely Relational LSTM, to address the challenge of relation reasoning across space and time between objects. In our Relational LSTM module, we utilize a non-local operation similar in spirit to the recently proposed non-local network to substitute the fully connected operation in the vanilla LSTM. By doing this, our Relational LSTM is capable of capturing long- and short-range spatio-temporal relations between objects in videos in a principled way. Then, we propose a two-branch neural architecture consisting of the Relational LSTM module as the non-local branch and a spatio-temporal pooling based local branch. The local branch is introduced for capturing local spatial appearance and/or short-term motion features. The two-branch modules are concatenated to learn video-level features from snippet-level ones end-to-end. Experimental results on the UCF-101 and HMDB-51 datasets show that our model achieves state-of-the-art results among LSTM-based methods, while obtaining comparable performance with other state-of-the-art methods (which use schemes that are not directly comparable). Our code will be released.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
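The non-local operation the abstract above says replaces the fully connected transform in the vanilla LSTM is, in essence, self-attention over all spatial positions. The PyTorch sketch below shows a standard embedded-Gaussian non-local block on a feature map as an illustrative stand-in; it is not the paper's Relational LSTM cell, and the layer names and channel-reduction factor are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class NonLocalBlockSketch(nn.Module):
    """A minimal embedded-Gaussian non-local (self-attention) block over
    the spatial positions of a (N, C, H, W) feature map."""
    def __init__(self, channels: int, inner: int = None):
        super().__init__()
        inner = inner or max(channels // 2, 1)
        self.theta = nn.Conv2d(channels, inner, 1)
        self.phi = nn.Conv2d(channels, inner, 1)
        self.g = nn.Conv2d(channels, inner, 1)
        self.out = nn.Conv2d(inner, channels, 1)

    def forward(self, x):
        n, c, h, w = x.shape
        q = self.theta(x).flatten(2).transpose(1, 2)   # (N, HW, inner)
        k = self.phi(x).flatten(2)                     # (N, inner, HW)
        v = self.g(x).flatten(2).transpose(1, 2)       # (N, HW, inner)
        attn = F.softmax(q @ k, dim=-1)                # pairwise relations over positions
        y = (attn @ v).transpose(1, 2).reshape(n, -1, h, w)
        return x + self.out(y)                         # residual connection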
2017
Wu, Tianfu; Lu, Yang; Zhu, Song-Chun
Online Object Tracking, Learning and Parsing with And-Or Graphs Journal Article
In: IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI), vol. 39, no. 12, pp. 2465–2480, 2017.
@article{TLP-PAMI,
title = {Online Object Tracking, Learning and Parsing with And-Or Graphs},
author = {Tianfu Wu and Yang Lu and Song-Chun Zhu},
url = {http://arxiv.org/abs/1509.08067
https://github.com/tfwu/RGM-AOGTracker},
doi = {10.1109/TPAMI.2016.2644963},
year = {2017},
date = {2017-01-01},
urldate = {2017-01-01},
journal = {IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)},
volume = {39},
number = {12},
pages = {2465--2480},
abstract = {This paper presents a method, called AOGTracker, for simultaneously tracking, learning and parsing (TLP) of unknown objects in video sequences with a hierarchical and compositional And-Or graph (AOG) representation. The TLP method is formulated in the Bayesian framework with spatial and temporal dynamic programming (DP) algorithms inferring object bounding boxes on-the-fly. During online learning, the AOG is discriminatively learned using latent SVM to account for appearance (e.g., lighting and partial occlusion) and structural (e.g., different poses and viewpoints) variations of a tracked object, as well as distractors (e.g., similar objects) in the background. Three key issues in online inference and learning are addressed: (i) maintaining purity of positive and negative examples collected online, (ii) controlling model complexity in latent structure learning, and (iii) identifying critical moments to re-learn the structure of the AOG based on its intrackability. The intrackability measures the uncertainty of an AOG based on its score maps in a frame. In experiments, our AOGTracker is tested on two popular tracking benchmarks with the same parameter setting: the TB-100/50/CVPR2013 benchmarks, and the VOT benchmarks --- VOT 2013, 2014, 2015 and TIR2015 (thermal imagery tracking). In the former, our AOGTracker outperforms state-of-the-art tracking algorithms including two trackers based on deep convolutional networks. In the latter, our AOGTracker outperforms all other trackers in VOT2013 and is comparable to the state-of-the-art methods in VOT2014, 2015 and TIR2015.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Zhao, Bo; Wu, Botong; Wu, Tianfu; Wang, Yizhou
Zero-Shot Learning Posed as a Missing Data Problem Proceedings Article
In: 2017 IEEE International Conference on Computer Vision Workshops, ICCV Workshops 2017, Venice, Italy, October 22-29, 2017, pp. 2616–2622, 2017.
@inproceedings{DBLP:conf/iccvw/ZhaoWWW17,
title = {Zero-Shot Learning Posed as a Missing Data Problem},
author = {Bo Zhao and Botong Wu and Tianfu Wu and Yizhou Wang},
url = {https://doi.org/10.1109/ICCVW.2017.310},
doi = {10.1109/ICCVW.2017.310},
year = {2017},
date = {2017-01-01},
booktitle = {2017 IEEE International Conference on Computer Vision Workshops,
ICCV Workshops 2017, Venice, Italy, October 22-29, 2017},
pages = {2616--2622},
crossref = {DBLP:conf/iccvw/2017},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2016
Li, Yunzhu; Sun, Benyuan; Wu, Tianfu; Wang, Yizhou
Face Detection with End-to-End Integration of a ConvNet and a 3D Model Proceedings Article
In: Proceedings of The 14th European Conference on Computer Vision (ECCV), Amsterdam, The Netherlands, October 11-14, 2016.
@inproceedings{FaceDet-ConvNet-3D,
title = {Face Detection with End-to-End Integration of a ConvNet and a 3D Model},
author = {Yunzhu Li and Benyuan Sun and Tianfu Wu and Yizhou Wang},
url = {http://arxiv.org/abs/1606.00850
https://github.com/tfwu/FaceDetection-ConvNet-3D},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of The 14th European Conference on Computer Vision (ECCV), Amsterdam, The Netherlands, October 11-14},
abstract = {This paper presents a method for face detection in the wild, which integrates a ConvNet and a 3D mean face model in an end-to-end multi-task discriminative learning framework. The 3D mean face model is predefined and fixed (e.g., we used the one provided in the AFLW dataset). The ConvNet consists of two components: (i) The face proposal component computes face bounding box proposals via estimating facial key-points and the 3D transformation (rotation and translation) parameters for each predicted key-point w.r.t. the 3D mean face model. (ii) The face verification component computes detection results by pruning and refining proposals based on facial key-points based configuration pooling. The proposed method addresses two issues in adapting state-of-the-art generic object detection ConvNets (e.g., faster R-CNN) for face detection: (i) One is to eliminate the heuristic design of predefined anchor boxes in the region proposal network (RPN) by exploiting a 3D mean face model. (ii) The other is to replace the generic RoI (Region-of-Interest) pooling layer with a configuration pooling layer to respect underlying object structures. The multi-task loss consists of three terms: the classification Softmax loss and the location smooth l1-losses [14] of both the facial key-points and the face bounding boxes. In experiments, our ConvNet is trained on the AFLW dataset only and tested on the FDDB benchmark with fine-tuning and on the AFW benchmark without fine-tuning. The proposed method obtains very competitive state-of-the-art performance in the two benchmarks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Li, Bo; Wu, Tianfu; Shao, Shuai; Zhang, Lun; Chu, Rufeng
Object Detection via End-to-End Integration of Aspect Ratio and Context Aware Part-based Models and Fully Convolutional Networks Miscellaneous
arXiv preprint, 2016.
@misc{ARC-FCN,
title = {Object Detection via End-to-End Integration of Aspect Ratio and Context Aware Part-based Models and Fully Convolutional Networks},
author = {Bo Li and Tianfu Wu and Shuai Shao and Lun Zhang and Rufeng Chu},
url = {https://arxiv.org/abs/1612.00534},
year = {2016},
date = {2016-01-01},
journal = {CoRR},
volume = {abs/1612.00534},
abstract = {This paper presents a framework of integrating a mixture of part-based models and region-based convolutional networks for accurate and efficient object detection. Each mixture component consists of a small number of parts accounting for both object aspect ratio and contextual information explicitly. The mixture is category-agnostic for the simplicity of scaling up in applications. Both object aspect ratio and context have been extensively studied in traditional object detection systems such as the mixture of deformable part-based models [13]. They are, however, largely ignored in deep neural network based detection systems [17, 16, 39, 8]. The proposed method addresses this issue in two-fold: (i) It remedies the wrapping artifact due to the generic RoI (region-of-interest) pooling (e.g., a 3 x 3 grid) by taking into account object aspect ratios. (ii) It models both global (from the whole image) and local (from the surrounding of a bounding box) context for improving performance. The integrated framework is fully convolutional and enjoys end-to-end training, which we call the aspect ratio and context aware fully convolutional network (ARC-FCN). In experiments, ARC-FCN shows very competitive results on the PASCAL VOC datasets; especially, it outperforms both Faster R-CNN [39] and R-FCN [8] with significantly better mean average precision (mAP) using a larger value for the intersection-over-union (IoU) threshold (i.e., 0.7 in the experiments). ARC-FCN is still sufficiently efficient with a test-time speed of 380ms per image, faster than the Faster R-CNN but slower than the R-FCN.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Chen, Diqi; Wang, Yizhou; Wu, Tianfu; Gao, Wen
Recurrent Attentional Model for No-Reference Image Quality Assessment Miscellaneous
arXiv preprint, 2016.
@misc{Chen_IQA,
title = {Recurrent Attentional Model for No-Reference Image Quality Assessment},
author = {Diqi Chen and Yizhou Wang and Tianfu Wu and Wen Gao},
url = {https://arxiv.org/abs/1612.03530},
year = {2016},
date = {2016-01-01},
journal = {CoRR},
volume = {abs/1612.03530},
abstract = {This paper presents a recurrent attentional model (RAM) for general no-reference image quality assessment (NR-IQA), that is, to predict the perceptual quality score for an input image without using any reference image and/or prior knowledge regarding underlying distortions. The proposed RAM is inspired by the well known visual attention mechanism, both covert and overt, which affects many aspects of visual perception including image quality assessment. The attentional mechanism is, however, largely ignored in the NR-IQA literature. The proposed RAM hypothesizes that the attentional scanning path in an image should contain intrinsic information for IQA. The RAM thus consists of three components: a glimpse sub-network analyzing the quality at a fixation using multi-scale information, a location sub-network selecting where to look next by sampling a stochastic node, and a recurrent network aggregating information along the scanning path to compute the final prediction. The RAM is formulated under multi-task learning for the joint prediction of distortion type and image quality score, and the REINFORCE rule is used to handle the stochastic node. The RAM is trained through back-propagation. In experiments, the RAM is tested on the TID2008 dataset with promising performance obtained, which shows the effectiveness of the proposed RAM. Furthermore, the RAM is very efficient in the sense that a small number of glimpses is usually used in testing.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
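The REINFORCE rule mentioned in the abstract above handles the stochastic location node: gradients flow through the log-probabilities of the sampled fixations, weighted by a baseline-subtracted reward. The sketch below shows the standard policy-gradient surrogate loss under stated assumptions about tensor shapes and the reward definition; it is not the paper's training code.

import torch

def reinforce_location_loss(log_probs: torch.Tensor,
                            rewards: torch.Tensor,
                            baseline: torch.Tensor) -> torch.Tensor:
    """Policy-gradient surrogate loss for the stochastic fixation node.
    log_probs: (T, N) log-probabilities of the sampled locations over
    T glimpses; rewards: (N,) task reward (e.g., negative quality
    prediction error); baseline: (N,) variance-reduction baseline."""
    advantage = (rewards - baseline).detach()          # no gradient through the reward
    return -(log_probs * advantage.unsqueeze(0)).sum(dim=0).mean()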