@inproceedings{oai:kanazawa-u.repo.nii.ac.jp:00007931,
  author    = {Hirano, Akihiro and Nakayama, Kenji},
  title     = {Implementation of Large-Scale {FIR} Adaptive Filters on {nVIDIA} {GeForce} Graphics Processing Unit},
  booktitle = {ISPACS 2010 - 2010 International Symposium on Intelligent Signal Processing and Communication Systems, Proceedings},
  year      = {2010},
  month     = jan,
  abstract  = {This paper presents implementations of an FIR adaptive filter with a large number of taps on nVIDIA GeForce graphics processing unit (GPU) and CUDA software development environment. In order to overcome a long access latency for slow off-chip memory access, reduction of memory accesses by re-ordering and vector load/store operations and an increase of the number of threads are introduced. A tree adder is introduced to reduce the cost for summing thread outputs up. A simultaneous execution of multiple filters are also examined. On low-cost platform such as an Atom/ION nettop, GPU will accelerates the computation by almost three times. For simultaneous multiple simulations such as an ensemble averaging, a GPU with a large number of processing elements outperforms a dual-core CPU; almost six times faster for 16 runs.},
  note      = {{\copyright}~2010 IEEE, 金沢大学理工研究域電子情報学系},
  internal-note = {month=jan comes from the repository record; ISPACS 2010 took place in late 2010 -- verify against IEEE Xplore},
}