Research
Our research has been generously supported by ARO, NSF, AFRL, IARPA, BlueHalo and Salesforce.
2021
Sun, Wei; Wu, Tianfu
Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), 2021.
@article{LostGANs,
title = {Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/2003.11571},
doi = {10.1109/TPAMI.2021.3078577},
year = {2021},
date = {2021-05-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},
abstract = {With the remarkable recent progress on learning deep generative models, it becomes increasingly interesting to develop models for controllable image synthesis from reconfigurable inputs. This paper focuses on a recently emerged task, layout-to-image, to learn generative models that are capable of synthesizing photo-realistic images from spatial layout (i.e., object bounding boxes configured in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors). This paper first proposes an intuitive paradigm for the task, layout-to-mask-to-image, to learn to unfold object masks of given bounding boxes in an input layout to bridge the gap between the input layout and synthesized images. Then, this paper presents a method built on Generative Adversarial Networks for the proposed layout-to-mask-to-image with style control at both image and mask levels. Object masks are learned from the input layout and iteratively refined along stages in the generator network. Style control at the image level is the same as in vanilla GANs, while style control at the object mask level is realized by a proposed novel feature normalization scheme, Instance-Sensitive and Layout-Aware Normalization. In experiments, the proposed method is tested on the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
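The ISLA-Norm scheme described in the entry above is, at a high level, a normalization whose scale and shift parameters are predicted per object and spread onto the feature map through the object masks. Below is a minimal PyTorch sketch of that idea, assuming per-object style vectors and soft masks as inputs; the class SimpleISLANorm and its two projection layers are illustrative placeholders, not the paper's implementation.

import torch
import torch.nn as nn

class SimpleISLANorm(nn.Module):
    """Simplified sketch of an instance-sensitive, layout-aware normalization.

    Each object contributes per-channel scale/shift parameters predicted from
    its style vector; the parameters are spread onto the feature map through
    soft object masks and applied after (non-affine) batch normalization.
    """
    def __init__(self, num_features, style_dim):
        super().__init__()
        self.bn = nn.BatchNorm2d(num_features, affine=False)
        # Hypothetical projections from a per-object style code to gamma/beta.
        self.to_gamma = nn.Linear(style_dim, num_features)
        self.to_beta = nn.Linear(style_dim, num_features)

    def forward(self, x, obj_styles, obj_masks):
        # x:          (B, C, H, W) feature map
        # obj_styles: (B, N, style_dim) per-object latent style vectors
        # obj_masks:  (B, N, H, W) soft masks locating each object in the lattice
        gamma = self.to_gamma(obj_styles)                          # (B, N, C)
        beta = self.to_beta(obj_styles)                            # (B, N, C)
        # Spread per-object parameters onto spatial locations via the masks.
        gamma_map = torch.einsum('bnc,bnhw->bchw', gamma, obj_masks)
        beta_map = torch.einsum('bnc,bnhw->bchw', beta, obj_masks)
        return self.bn(x) * (1.0 + gamma_map) + beta_map

In this sketch the masks are expected to form a rough soft partition of the lattice, so each pixel's affine parameters become a mask-weighted mixture over the objects covering it.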
2020
Xing, Xianglei; Wu, Tianfu; Zhu, Song-Chun; Wu, Ying Nian
Towards Interpretable Image Synthesis by Learning Sparsely Connected AND-OR Networks Proceedings Article
In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020.
@inproceedings{iGenerativeM,
title = {Towards Interpretable Image Synthesis by Learning Sparsely Connected AND-OR Networks},
author = {Xianglei Xing and Tianfu Wu and Song-Chun Zhu and Ying Nian Wu},
url = {https://arxiv.org/abs/1909.04324},
year = {2020},
date = {2020-02-23},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020},
abstract = {This paper proposes interpretable image synthesis by learning hierarchical AND-OR networks of sparsely connected semantically meaningful nodes. The proposed method is based on the compositionality and interpretability of the scene-objects-parts-subparts-primitives hierarchy in image representation. A scene has different types (i.e., OR), each of which consists of a number of objects (i.e., AND). This can be recursively formulated across the scene-objects-parts-subparts hierarchy and is terminated at the primitive level (e.g., Gabor wavelets-like basis). To realize this interpretable AND-OR hierarchy in image synthesis, the proposed method consists of two components: (i) Each layer of the hierarchy is represented by an over-complete set of basis functions. The basis functions are instantiated using convolution to be translation covariant. Off-the-shelf convolutional neural architectures are then exploited to implement the hierarchy. (ii) Sparsity-inducing constraints are introduced in end-to-end training, which facilitate a sparsely connected AND-OR network to emerge from initially densely connected convolutional neural networks. A straightforward sparsity-inducing constraint is utilized, that is, to only allow the top-k basis functions to be active at each layer (where k is a hyperparameter). The learned basis functions are also capable of image reconstruction to explain away input images. In experiments, the proposed method is tested on five benchmark datasets. The results show that meaningful and interpretable hierarchical representations are learned with better quality of image synthesis and reconstruction than state-of-the-art baselines.},
howpublished = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
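The sparsity constraint mentioned in the abstract above, keeping only the top-k basis responses active, can be illustrated in a few lines of PyTorch. Whether the selection is applied per spatial location, per map, or per layer is a detail of the paper; the per-location choice below, and the function name, are assumptions made purely for illustration.

import torch

def topk_activation(responses, k):
    """Keep only the k strongest basis responses at each spatial location,
    zeroing the rest (a simple top-k sparsity constraint across channels)."""
    # responses: (B, C, H, W) responses of C basis functions
    B, C, H, W = responses.shape
    flat = responses.permute(0, 2, 3, 1).reshape(-1, C)        # (B*H*W, C)
    topk_vals, topk_idx = flat.topk(k, dim=1)
    sparse = torch.zeros_like(flat).scatter_(1, topk_idx, topk_vals)
    return sparse.reshape(B, H, W, C).permute(0, 3, 1, 2)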
2019
Sun, Wei; Wu, Tianfu
Image Synthesis from Reconfigurable Layout and Style Proceedings Article
In: International Conference on Computer Vision (ICCV), 2019.
@inproceedings{LostGAN,
title = {Image Synthesis from Reconfigurable Layout and Style},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/1908.07500
https://github.com/iVMCL/LostGANs},
year = {2019},
date = {2019-10-28},
booktitle = {International Conference on Computer Vision (ICCV)},
abstract = {Despite remarkable recent progress on both unconditional and conditional image synthesis, it remains a long-standing problem to learn generative models that are capable of synthesizing realistic and sharp images from reconfigurable spatial layout (i.e., bounding boxes + class labels in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors), especially at high resolution. By reconfigurable, we mean that a model can preserve the intrinsic one-to-many mapping from a given layout to multiple plausible images with different styles, and is adaptive with respect to perturbations of the layout and style latent code. In this paper, we present a layout- and style-based architecture for generative adversarial networks (termed LostGANs) that can be trained end-to-end to generate images from reconfigurable layout and style. Inspired by the vanilla StyleGAN, the proposed LostGAN consists of two new components: (i) learning fine-grained mask maps in a weakly-supervised manner to bridge the gap between layouts and images, and (ii) learning object instance-specific layout-aware feature normalization (ISLA-Norm) in the generator to realize multi-object style generation. In experiments, the proposed method is tested on the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
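LostGAN's layout-to-mask step starts from nothing more than boxes and labels on an image lattice. As a point of reference, the sketch below rasterizes normalized boxes into coarse box-shaped layout maps; the function is hypothetical and shows only the trivial initialization that the generator's learned, iteratively refined masks go beyond.

import torch

def boxes_to_layout_maps(boxes, height, width):
    """Rasterize normalized bounding boxes into coarse per-object layout maps.

    boxes: (N, 4) tensor of (x0, y0, x1, y1) in [0, 1] image coordinates.
    Returns an (N, height, width) tensor with 1 inside each box, 0 outside.
    """
    maps = torch.zeros(boxes.shape[0], height, width)
    for i, (x0, y0, x1, y1) in enumerate(boxes.tolist()):
        c0, r0 = int(x0 * width), int(y0 * height)
        c1 = max(int(x1 * width), c0 + 1)   # keep at least one column
        r1 = max(int(y1 * height), r0 + 1)  # keep at least one row
        maps[i, r0:r1, c0:c1] = 1.0
    return maps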
Sun, Wei; Wu, Tianfu
Learning Spatial Pyramid Attentive Pooling in Image Synthesis and Image-to-Image Translation Miscellaneous
arXiv preprint, 2019.
@misc{SPAP,
title = {Learning Spatial Pyramid Attentive Pooling in Image Synthesis and Image-to-Image Translation},
author = {Wei Sun and Tianfu Wu},
url = {https://arxiv.org/abs/1901.06322},
year = {2019},
date = {2019-01-01},
journal = {CoRR},
volume = {abs/1901.06322},
abstract = {Image synthesis and image-to-image translation are two important generative learning tasks. Remarkable progress has been made by learning Generative Adversarial Networks (GANs) and cycle-consistent GANs (CycleGANs), respectively. This paper presents a method of learning Spatial Pyramid Attentive Pooling (SPAP), a novel architectural unit that can be easily integrated into both generators and discriminators in GANs and CycleGANs. The proposed SPAP integrates an Atrous spatial pyramid, a proposed cascade attention mechanism and residual connections. It leverages the advantages of the three components to facilitate effective end-to-end generative learning: (i) the capability of fusing multi-scale information by ASPP; (ii) the capability of capturing the relative importance of both spatial locations (especially multi-scale context) and feature channels by attention; (iii) the capability of preserving information and enhancing optimization feasibility by residual connections. Coarse-to-fine and fine-to-coarse SPAP are studied and intriguing attention maps are observed in both tasks. In experiments, the proposed SPAP is tested with GANs on the CelebA-HQ-128 dataset, and with CycleGANs on image-to-image translation datasets including the Cityscapes, Facades and Aerial Maps datasets, obtaining better performance in both settings.},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
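The SPAP unit combines the three ingredients named in the abstract above: ASPP-style dilated branches, attention over them, and a residual connection. A deliberately simplified PyTorch sketch of that combination follows; the module name, the fixed dilation rates, and the single per-pixel softmax attention stand in for the paper's cascade attention and are assumptions, not the published architecture.

import torch
import torch.nn as nn

class SimpleSPAP(nn.Module):
    """Simplified sketch in the spirit of Spatial Pyramid Attentive Pooling:
    multi-rate dilated branches (ASPP-like), a soft attention over the
    branches, and a residual connection."""
    def __init__(self, channels, rates=(1, 2, 4, 8)):
        super().__init__()
        self.branches = nn.ModuleList(
            nn.Conv2d(channels, channels, 3, padding=r, dilation=r) for r in rates
        )
        # Per-pixel attention logits, one per branch.
        self.attn = nn.Conv2d(channels, len(rates), 1)

    def forward(self, x):
        feats = torch.stack([b(x) for b in self.branches], dim=1)  # (B, R, C, H, W)
        weights = torch.softmax(self.attn(x), dim=1).unsqueeze(2)  # (B, R, 1, H, W)
        fused = (weights * feats).sum(dim=1)                       # (B, C, H, W)
        return x + fused  # residual connection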
2018
Li, Bo; Wu, Tianfu; Zhang, Lun; Chu, Rufeng
Auto-Context R-CNN Miscellaneous
arXiv preprint, 2018.
@misc{AutoCtxRCNN,
title = {Auto-Context R-CNN},
author = {Bo Li and Tianfu Wu and Lun Zhang and Rufeng Chu},
url = {https://arxiv.org/abs/1807.02842},
year = {2018},
date = {2018-01-01},
journal = {CoRR},
volume = {abs/1807.02842},
abstract = {Region-based convolutional neural networks (R-CNN) have largely dominated object detection. Operators defined on RoIs (Regions of Interest) play an important role in R-CNNs, such as RoIPooling and RoIAlign. They all only utilize information inside RoIs for RoI prediction, even with their recent deformable extensions. Although surrounding context is well-known for its importance in object detection, it has yet to be integrated into R-CNNs in a flexible and effective way. Inspired by the auto-context work and the multi-class object layout work, this paper presents a generic context-mining RoI operator (i.e., RoICtxMining) seamlessly integrated into R-CNNs, and the resulting object detection system is termed Auto-Context R-CNN, which is trained end-to-end. The proposed RoICtxMining operator is a simple yet effective two-layer extension of the RoIPooling or RoIAlign operator. Centered at an object-RoI, it creates a 3×3 layout to mine contextual information adaptively in the 8 surrounding context regions on-the-fly. Within each of the 8 context regions, a context-RoI is mined in terms of discriminative power and its RoIPooling / RoIAlign features are concatenated with the object-RoI for final prediction. The proposed Auto-Context R-CNN is robust to occlusion and small objects, and shows promising vulnerability to adversarial attacks without being adversarially trained. In experiments, it is evaluated using RoIPooling as the backbone and shows competitive results on the Pascal VOC, Microsoft COCO, and KITTI datasets (including 6.9% mAP improvement over the R-FCN method on the COCO test-dev dataset and first place on both KITTI pedestrian and cyclist detection as of this submission).},
howpublished = {arXiv preprint},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
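The 3×3 context layout used by RoICtxMining is easy to picture in code: the object RoI occupies the center cell, and the 8 neighboring cells, each with the object's width and height, are the candidate context regions. The small sketch below (plain Python, hypothetical function name) only enumerates those regions; the paper additionally mines a discriminative context-RoI inside each one.

def surrounding_context_boxes(x0, y0, x1, y1):
    """Enumerate the 8 context regions of a 3x3 layout centered at an object RoI.

    The object box occupies the center cell; each neighboring cell has the same
    width and height as the object box.
    """
    w, h = x1 - x0, y1 - y0
    boxes = []
    for row in (-1, 0, 1):
        for col in (-1, 0, 1):
            if row == 0 and col == 0:
                continue  # skip the center cell (the object RoI itself)
            boxes.append((x0 + col * w, y0 + row * h,
                          x1 + col * w, y1 + row * h))
    return boxes

In practice the surrounding boxes would also be clipped to the image bounds before pooling features from them.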