<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-447099751126587323</id><updated>2012-02-10T06:41:21.002+08:00</updated><category term='map/reduce'/><category term='metric learning'/><category term='GPU'/><category term='clustering'/><category term='Parzen window'/><category term='Markov logic'/><category term='adaptive algorithm'/><category term='variational approximation'/><category term='novel idea'/><category term='graph embedding'/><category term='product of experts'/><category term='continuation'/><category term='Gibbs sampler'/><category term='duality'/><category term='power-law degree distribution'/><category term='label propagation'/><category term='ranking'/><category term='robust learning'/><category term='multiple-label learning'/><category term='independent component analysis'/><category term='MRF'/><category term='SIGIR'/><category term='regression'/><category term='DC programming'/><category term='recommendation system'/><category term='ISOMAP'/><category term='binomial mixture model'/><category term='icml'/><category term='SVM'/><category term='natural language processing'/><category term='Fisher discriminant criterion'/><category term='gradient'/><category term='preference learning'/><category term='non-metric methods'/><category term='hashing'/><category term='aspecti mode'/><category term='MPI'/><category term='visualization'/><category term='cumulative distribution network'/><category term='SRBM'/><category term='information theory'/><category term='density estimation'/><category term='co-training'/><category term='simulated annealing'/><category term='icml 2006'/><category term='transfer learning'/><category 
term='matting'/><category term='dimension reduction'/><category term='boosting'/><category term='similarity'/><category term='sentiment analysis'/><category term='algorithm'/><category term='distance learning'/><category term='LDA'/><category term='HITS'/><category term='LPP'/><category term='real-time answer'/><category term='LTSA'/><category term='discriminative model'/><category term='PR'/><category term='information geometry'/><category term='consistency'/><category term='text'/><category term='JFA'/><category term='small world'/><category term='maximum entropy'/><category term='dynamic system'/><category term='causal inference'/><category term='optimization'/><category term='topic model'/><category term='mixture of experts'/><category term='large-scale problem'/><category term='frequentism'/><category term='spectral analysis'/><category term='bundle method'/><category term='deep belief network'/><category term='roughly scanned'/><category term='Bregman divergence'/><category term='bayesian framework'/><category term='collaborative filtering'/><category term='ensemble'/><category term='transductive learning'/><category term='unsupervised learning'/><category term='maximum margin'/><category term='multi-task learning'/><category term='compressive sampling'/><category term='relational learning'/><category term='manifold learning'/><category term='graph'/><category term='expectation propagation'/><category term='MCMC'/><category term='non-parametric Bayesian method'/><category term='parallel analysis'/><category term='decision tree'/><category term='online algorithm'/><category term='expectation maximization'/><category term='Nyström method'/><category term='feature extraction'/><category term='semidefinite programming'/><category term='portfolio effect'/><category term='locally adaptive classifier'/><category term='Dirichlet process'/><category term='stochastic gradient descent'/><category term='belief propagation'/><category term='random graph'/><category 
term='eccv2008'/><category term='feature selection'/><category term='probabilistic graphical model'/><category term='Laplacian Eigenmap'/><category term='supervised learning'/><category term='social network'/><category term='solver'/><category term='logistic regression'/><category term='generative model'/><category term='semi-supervised learning'/><category term='MMMF'/><category term='convolutional networks'/><category term='Gaussian process'/><category term='tangent space'/><category term='ordinal regression'/><category term='kernel selection'/><category term='computational photography'/><category term='fixed-point algorithm'/><category term='active learning'/><category term='RBM'/><category term='universum'/><category term='MVU'/><category term='random walk'/><category term='theoretical science'/><category term='dynamics'/><category term='tensor'/><category term='quadratic programming'/><category term='EM'/><category term='independence test'/><category term='texture'/><category term='sparsity'/><category term='Markov chain'/><category term='PageRank'/><category term='kernel'/><category term='regularization'/><category term='HMM'/><category term='multiple-instance learning'/><category term='LLE'/><category term='SDP'/><category term='nonnegative constraints'/><category term='ROC'/><category term='multiple-kernel learning'/><category term='PCA'/><category term='GMM'/><category term='Hopfield networks'/><title type='text'>Paper Scanner</title><subtitle type='html'></subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default?max-results=100'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><link rel='next' type='application/atom+xml' 
href='http://www.blogger.com/feeds/447099751126587323/posts/default?start-index=101&amp;max-results=100'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>201</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>100</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1348070926913464980</id><published>2012-02-02T16:13:00.000+08:00</published><updated>2012-02-02T16:13:04.455+08:00</updated><title type='text'>Frequency-tuned Salient Region Detection</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.utoronto.ca/%7Estrider/publications/SaliencyCVPR09.pdf" target="_blank"&gt;&lt;i&gt;Radhakrishna Achanta, Sheila Hemami, Francisco Estrada, and Sabine Süsstrunk&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is a very simple paper. The idea is to blur the image a little (with Gaussian) and compare the distance of each pixel to the mean. 
With this saliency map, DoG can be applied to extract the edges.&lt;br /&gt;&lt;br /&gt;This method will fail if the object has many colors or the background is complex.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1348070926913464980?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1348070926913464980/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1348070926913464980' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1348070926913464980'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1348070926913464980'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2012/02/frequency-tuned-salient-region.html' title='Frequency-tuned Salient Region Detection'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4573240528120346900</id><published>2011-12-31T22:18:00.001+08:00</published><updated>2011-12-31T22:18:34.270+08:00</updated><title type='text'>AppJoy: Personalized Mobile Application Discovery</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.uml.edu/%7Eglchen/papers/appjoy-mobisys-2011.pdf" target="_blank"&gt;&lt;i&gt;Bo Yan and Guanling Chen&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper introduces the author's app AppJoy, 
which collects users' usage of apps on a mobile device, derives the similarity scores between apps based on the usage and recommend apps based on similarity scores. The idea is to utilize the time and spatial information along with the apps. Contrastive to other recommendation algorithm, based on ratings, the app usage may serve as a much better feature. I guess this shows a simpler model with strong feature might serve better than a complicated model with weak features. &lt;br /&gt;&lt;br /&gt;It seems that, if we decide to move on to applications in mobile devices, we'd better get to know the details of the mobile platforms. E.g. IOS might not allow you to collect such usage information. So the experiments can only be done on Android.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4573240528120346900?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4573240528120346900/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4573240528120346900' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4573240528120346900'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4573240528120346900'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/12/appjoy-personalized-mobile-application.html' title='AppJoy: Personalized Mobile Application Discovery'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8260513175815134438</id><published>2011-12-31T20:33:00.001+08:00</published><updated>2011-12-31T20:33:07.539+08:00</updated><title type='text'>DOT: A Matrix Model for Analyzing, Optimizing and Deploying Software for Big Data Analytics in Distributed Systems</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs11-8.html" target="_blank"&gt;&lt;i&gt;Yin Huai, Rubao Lee, Simon Zhang, Cathy H. Xia and Xiaodong Zhang&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;I heard the talk in HIC 2011 but hardly knew what this paper is about. After reading the paper for a while, I realized that the author just formulates the jobs in a distributed system with a so-called DOT expression, i.e. data (row vector), operator (several column vectors) and transformation (another function on aggregated output of operators). Therefore a matrix-like expression can be formulated this way. Apparently Map/Reduce and dryad jobs can be formulated with it.&lt;br /&gt;&lt;br /&gt;The author discussed a little bit about the property of DOT formulation but I think they are way behind math. I don't see any necessity in formulating those jobs with this strange expression. The algebra also doesn't reveal anything appealing. Maybe I am wrong. 
After all I am not in that field.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8260513175815134438?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8260513175815134438/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8260513175815134438' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8260513175815134438'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8260513175815134438'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/12/dot-matrix-model-for-analyzing.html' title='DOT: A Matrix Model for Analyzing, Optimizing and Deploying Software for Big Data Analytics in Distributed Systems'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3673341354687343954</id><published>2011-12-31T19:59:00.000+08:00</published><updated>2011-12-31T19:59:09.758+08:00</updated><title type='text'>Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://research.microsoft.com/pubs/63785/eurosys07.pdf" target="_blank"&gt;&lt;i&gt;Michael Isard, Mihai Budiu, Yuan Yu, Andrew Birrell and Dennis Fetterly&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;Dryad is kind of lower-level computational model 
than MapReduce. The programmer has to write a DAG for a specific task. The programmer implements the job details on each vertex and defines what kind of edges connect the vertices. In this way, the programmer first decomposes the tasks into a DAG. The commonly seen parallelization of splitting data has to be done manually or by some library to read data from a GFS like distributed storage. There is no shuffling like in reduce step. So programmer has to build his own for this type of operation (or dryad's library has such implementation). The communication (i.e. the edges) can be fine tuned by the programmer, like a TCP pipe or a temporary file.&lt;br /&gt;&lt;br /&gt;On the whole dryad provides a lower level of computational model, on top of which other computational models (like Map/Reduce) can be built. But it also raises bar for its users. I don't think MS will open source for dryad and this is MS's style. Proprietary software, limited community...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3673341354687343954?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3673341354687343954/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3673341354687343954' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3673341354687343954'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3673341354687343954'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/12/dryad-distributed-data-parallel.html' title='Dryad: Distributed Data-Parallel Programs from Sequential Building 
Blocks'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-272659292239172941</id><published>2011-12-31T18:57:00.001+08:00</published><updated>2011-12-31T18:57:58.398+08:00</updated><title type='text'>Spark: Cluster Computing with Working Sets</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.mesosproject.org/papers/hotcloud_spark.pdf" target="_blank"&gt;&lt;i&gt;Matei Zaharia, Mosharaf Chowdhury, Michael J. Franklin, Scott Shenker and Ion Stoica&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;Spark uses a so-called RDD (resilient distributed dataset) for data intensive operations, like iterative machine learning algorithms, which reduces repeated IO of MapReduce. Similar parallel operations like map, reduce, collect and foreach are defined. Reduce is even simpler (only associative operator, like plus). The dataset will be held in memory as long as the job is not finished. 
Parameterized models, when parameters can be stored in memory, will surely benefit from this design.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-272659292239172941?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/272659292239172941/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=272659292239172941' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/272659292239172941'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/272659292239172941'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/12/spark-cluster-computing-with-working.html' title='Spark: Cluster Computing with Working Sets'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5707461937119615047</id><published>2011-11-06T16:29:00.000+08:00</published><updated>2011-11-06T17:29:48.840+08:00</updated><title type='text'>Swing or not to Swing: Learn When (not) to Advertise</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://research.yahoo.com/pub/2343" target="_blank"&gt;&lt;i&gt;Andrei Broder, Massimiliano Ciaramita et al.&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about how to learn to show the ads or not in some cases. 
The basic idea is to learn a classification model based on candidate sets's features, like relevance feature, cohesiveness features. This is compared to a thresholding method, which can be trivially obtained via commonly used retrieval system of ads and proved more effective when our goal is not to maintain a high recall rate (therefore the more the better case).&lt;br /&gt;&lt;br /&gt;Another idea is to train the model based on click feedbacks instead of editorial judgement. The paper mentioned to use online learning techniques, which is the reason that most online serving models has to take into account of the time-evolving property and allows the model to be trained in an incremental style.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5707461937119615047?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5707461937119615047/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5707461937119615047' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5707461937119615047'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5707461937119615047'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/11/swing-or-not-to-swing-learn-when-not-to.html' title='Swing or not to Swing: Learn When (not) to Advertise'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5371172686048111311</id><published>2011-11-06T16:23:00.000+08:00</published><updated>2011-11-06T16:23:31.188+08:00</updated><title type='text'>Impedance Coupling in Content-Targeted Advertising</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://portal.acm.org/citation.cfm?id=1076119" target="_blank"&gt;&lt;i&gt;Berthier Ribeiro-Neto, Marco Cristo, Paulo B. Golgher and Edleno Silva de Moura&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about an expansion of keywords in a webpages by using similar webpages. By this expansion, the retrieval of related ads can be improved. It is different from another idea, that is, the expansion set is mined from search engine.&lt;br /&gt;&lt;br /&gt;The basic model is Bayesian belief net, where for each document &lt;i&gt;D&lt;/i&gt;&lt;sub&gt;0&lt;/sub&gt;, we take its most similar pages &lt;i&gt;D&lt;sub&gt;k&lt;/sub&gt;&lt;/i&gt; and create links from the doc to its related terms. Originally, the doc's related terms are sparse when used in retrieval (due to vocabulary impedance problem). But with similar pages, the candidate sets get enlarged but still we need to select which are good for the current page. Take Pr(&lt;i&gt;D&lt;sub&gt;i&lt;/sub&gt;&lt;/i&gt;) as constant, and the conditional probabilities Pr(&lt;i&gt;R&lt;/i&gt; | &lt;i&gt;D&lt;sub&gt;i&lt;/sub&gt;&lt;/i&gt;) as some function related to the similarity to &lt;i&gt;D&lt;/i&gt;&lt;sub&gt;0&lt;/sub&gt; and Pr(&lt;i&gt;T&lt;sub&gt;j&lt;/sub&gt;&lt;/i&gt; | &lt;i&gt;D&lt;sub&gt;i&lt;/sub&gt;&lt;/i&gt;) as some normalized term frequencies. 
We may evaluate Pr(&lt;i&gt;T&lt;sub&gt;j&lt;/sub&gt;&lt;/i&gt; | &lt;i&gt;R&lt;/i&gt;) for each document as a ranking score to choose the corresponding term.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5371172686048111311?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5371172686048111311/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5371172686048111311' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5371172686048111311'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5371172686048111311'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/11/impedance-coupling-in-content-targeted.html' title='Impedance Coupling in Content-Targeted Advertising'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6160896749490868771</id><published>2011-11-06T15:26:00.000+08:00</published><updated>2011-11-06T15:26:34.682+08:00</updated><title type='text'>Review Spotlight: A User Interface for Summerizing User-Generated Reviews Using Adjective-Noun Word Pairs</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://yatani.jp/paper/CHI2011.pdf" target="_blank"&gt;&lt;i&gt;Koji Yatani, Michael Novati, Andrew Trusty and Khai N. 
Truong&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is an interesting HCI paper on showing interviews in the form of adj.+ n. form. The sentiment analysis shows the positiveness and negativeness of each mined structure, which renders the color of the phrases. The sizes can be determined by the popularity of the phrases. The interface helps the user get better impression on the product of interest. The layout of the phrases are not very important. When the mouse hovers the selected phrases, the corresponding reviews will be listed (maybe with a smart summary?).&lt;br /&gt;&lt;br /&gt;In a way, this interface has similarity in word/tag clouds but do has very reasonably advantageous edges. Shall we implement one for some projects?&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6160896749490868771?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6160896749490868771/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6160896749490868771' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6160896749490868771'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6160896749490868771'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/11/review-spotlight-user-interface-for.html' title='Review Spotlight: A User Interface for Summerizing User-Generated Reviews Using Adjective-Noun Word Pairs'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2375634832309972073</id><published>2011-06-19T15:19:00.000+08:00</published><updated>2011-06-19T15:19:16.191+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SIGIR'/><title type='text'>SIGIR 2011</title><content type='html'>This time it is in Beijing.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.sigir2011.org/tutorials.htm"&gt;&lt;b&gt;Tutorials&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;I'd say most of them are quite elementary:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;&lt;a href="http://www.sigir2011.org/machine-learning-for-ir-m.htm"&gt;one&lt;/a&gt; tells machine learning techniques for IR;&lt;/li&gt;&lt;li&gt;&lt;a href="http://www.sigir2011.org/logistic-regression-reloaded.htm"&gt;another&lt;/a&gt; talks about online algorithm, which is the now trending topic in ML;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;&lt;a href="http://www.sigir2011.org/keynotes.htm"&gt;&lt;b&gt;Keynotes&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;It is quite general but should be interesting enough.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.sigir2011.org/papers.htm"&gt;&lt;b&gt;Papers&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Not sure about the content. No online proceedings. 
Not sure we may go there and listen to some of the talks.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.sigir2011.org/workshops.htm"&gt;&lt;b&gt;Workshops&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Some of them look interesting:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;&lt;a href="http://ciir.cs.umass.edu/sigir2011/qru/"&gt;query session&lt;/a&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="http://research.microsoft.com/en-us/um/beijing/events/eos2011/"&gt;entity oriented search&lt;/a&gt;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;Hope I may sneak in.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2375634832309972073?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2375634832309972073/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2375634832309972073' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2375634832309972073'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2375634832309972073'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/sigir-2011.html' title='SIGIR 2011'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5442481311858142219</id><published>2011-06-19T11:16:00.000+08:00</published><updated>2011-06-19T11:16:45.403+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' 
term='stochastic gradient descent'/><category scheme='http://www.blogger.com/atom/ns#' term='social network'/><title type='text'>Like like alike---Joint Friendship and Interest Propagation in Social Networks</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cc.gatech.edu/%7Esyang46/papers/WWW11FIP.pdf"&gt;&lt;i&gt;Shuang Hong Yang, Bo Long, Alex Smola, Narayanan Sadagopan, Zhaohui Zheng and Hongyuan Zha&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper extends the idea of &lt;a href="http://paperscanner.blogspot.com/2011/06/regression-based-latent-factor-models.html"&gt;RLFM&lt;/a&gt;: we interpret RLFM with a supervised learning problem and this paper extends this idea with a social net, mainly via regularizers. So that's why the probabilistic model in RLFM may not be that important (just an interpretation in a disciplined way). There are 7 regularizers in total, which leads to a very complicated objective function.&lt;br /&gt;&lt;br /&gt;The optimization is solved via stochastic gradient descent. 
Maybe for other techniques like interior point method, it is too difficult to derive?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5442481311858142219?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5442481311858142219/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5442481311858142219' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5442481311858142219'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5442481311858142219'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/like-like-alike-joint-friendship-and.html' title='Like like alike---Joint Friendship and Interest Propagation in Social Networks'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7616210196862367521</id><published>2011-06-15T18:16:00.000+08:00</published><updated>2011-06-15T18:16:36.253+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><category scheme='http://www.blogger.com/atom/ns#' term='recommendation system'/><title type='text'>Regression-based Latent Factor Models</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www-ai.cs.uni-dortmund.de/PROCEEDINGS/SIKDD2009/docs/p19.pdf"&gt;&lt;i&gt;Deepak 
Agarwal and Bee-Chung Chen&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about another way of using the rating matrix.&lt;br /&gt;&lt;br /&gt;Given features of users and items, there are some correlations between them. The CCA idea is ``unsupervised'' since it works on two kinds of data and finds the maximum correlations without further ``supervised information.'' Then if indeed we have some ``supervised'' information, that is how correlated a user is to some items, the problem becomes a ``supervised learning'' version. Actually, most of the literature is in this style. Haha, you might have realized my point now: is there any ``semi-supervised'' version? Yeah, I'd like to develop one :-)&lt;br /&gt;&lt;br /&gt;OK, let's go back to this paper. So with supervised information, the problem of CCA becomes finding two linear mappings of user's feature and item's feature into a common subspace, so that their inner product approximates the given one. This might be problematic, since obviously the ratings might be inconsistent with their relative face values. But anyway we have this basic idea: it's like what we get in PCA.&lt;br /&gt;&lt;br /&gt;The second step is to make some probabilistic model out of it (PCA to PPCA) so that everything gets some interpretation and a disciplined way to train the model and make inferences. The paper shows us an interpretation: there is a common subspace that each user/item is drawn from (zero mean, some variance?), which can be obtained from the original feature space (via a linear transformation). The observed rating can be seen as the inner product mixed with noise. With this graphical model, we may do the learning via EM. 
Here the authors chose the MCEM (MC to compute the expectation).&lt;br /&gt;&lt;br /&gt;Let's try some of the ideas.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7616210196862367521?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7616210196862367521/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7616210196862367521' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7616210196862367521'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7616210196862367521'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/regression-based-latent-factor-models.html' title='Regression-based Latent Factor Models'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3023309664297500372</id><published>2011-06-13T11:52:00.000+08:00</published><updated>2011-06-13T11:52:06.332+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='social network'/><title type='text'>Inferring Rlevent Social Networks from Interpersonal Communication</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://research.yahoo.com/files/fp1010-dechoudhury.pdf"&gt;&lt;i&gt;Munmum de Choudhury, Winter A. Mason, Jake M. Hofman and Ducan J. 
Watts&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;The paper makes the point that social nets are not really usable in their original form. We usually have to remove some edges to a certain extent for a specific task. My thought on this is that social edges are like aqueducts or wires. Different behaviors have different propagation coefficients. &lt;br /&gt;&lt;br /&gt;Another important thing in this paper is those features, in three categories:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;reach&lt;ul&gt;&lt;li&gt;node degree&lt;/li&gt;&lt;li&gt;average neighbor degree&lt;/li&gt;&lt;li&gt;size of two-hop neighborhood&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;li&gt;closure&lt;ul&gt;&lt;li&gt;embeddedness, an average of ratios of common friends over or-friends;&lt;/li&gt;&lt;li&gt;normalized clustering coefficient, the average probability that two of my neighbors are friends&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;li&gt;bridge&lt;ul&gt;&lt;li&gt;network constraints (sum of all neighbors, each inner producted with the node)&lt;/li&gt;&lt;li&gt;ego components (after removal, how many new components will appear)&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3023309664297500372?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3023309664297500372/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3023309664297500372' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3023309664297500372'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3023309664297500372'/><link rel='alternate' type='text/html' 
href='http://paperscanner.blogspot.com/2011/06/inferring-rlevent-social-networks-from.html' title='Inferring Rlevent Social Networks from Interpersonal Communication'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5648470431270434529</id><published>2011-06-13T07:44:00.000+08:00</published><updated>2011-06-13T07:44:32.128+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sentiment analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='topic model'/><title type='text'>Sentiment Analysis with Global Topics and Local Dependency</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.aaai.org/ocs/index.php/AAAI/AAAI10/paper/download/1913/2215"&gt;&lt;i&gt;Fangtao Li, Minlie Huang and Xiaoyan Zhu&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about the interaction between topic and sentiment. The first idea is to impose an additional factor of generating word: the topic decides the sentiment and the sentiment along with the topic decides the word. However, the structure can't determine where the writer changes his attitude. So they impose a hidden Markov structure on sentiment transitions. 
However the paper only gives some inference result and many parameters are manually set.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5648470431270434529?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5648470431270434529/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5648470431270434529' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5648470431270434529'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5648470431270434529'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/sentiment-analysis-with-global-topics.html' title='Sentiment Analysis with Global Topics and Local Dependency'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-9163102128683206933</id><published>2011-06-06T23:46:00.000+08:00</published><updated>2011-06-06T23:46:20.650+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><title type='text'>An Efficient Method For Compressive Sensing</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.122.2942&amp;amp;rep=rep1&amp;amp;type=pdf"&gt;&lt;i&gt;Seung-Jean Kim, Kwangmoo Koh, Michael Lustig and Stephen 
Boyd&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about how the interior point method is applied to L1-regularized least squares problems. The dual serves as an estimation of the duality gap, which then becomes the convergence criterion. The log barrier method formulates the primal objective function, which is solved by Newton's method. Newton's method requires solving a linear system, which is addressed by conjugate gradient. The accuracy of the solution can be adjusted as the search evolves.&lt;br /&gt;&lt;br /&gt;On a single machine this method may deal with millions of variables, in compressive sensing problems.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-9163102128683206933?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/9163102128683206933/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=9163102128683206933' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/9163102128683206933'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/9163102128683206933'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/efficient-method-for-compressive.html' title='An Efficient Method For Compressive Sensing'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5147356132559741729</id><published>2011-06-06T23:08:00.000+08:00</published><updated>2011-06-06T23:08:56.804+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='theoretical science'/><category scheme='http://www.blogger.com/atom/ns#' term='hashing'/><category scheme='http://www.blogger.com/atom/ns#' term='algorithm'/><title type='text'>Near-Optimal Hashing Algorithms For Approximate Nearest Neighbor in High Dimensions</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://web.mit.edu/andoni/www/papers/CACM-article.ps"&gt;&lt;i&gt;Alexandr Andoni and Piotr Indyk&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is a survey paper on locality-sensitive hashing (LSH). LSH is quite useful for approximately retrieving similar items according to a given distance. The fact is, once we have enough data, those approximate nearest neighbors would perform as well as exact ones. So why bother computing the exact version?&lt;br /&gt;&lt;br /&gt;The idea is based on stochastic algorithms in common literature. The idea is to select a random projection and combine the binning to the final hash table. To understand intuitively how this works, given a point, the random projection into one bin will cut down the probability the retrieved points are not around the query by some extent. And when more projections are made, we get closer. E.g. for Hamming distance, the projection can be further simplified as a subset of randomly chosen coordinate indices.&lt;br /&gt;&lt;br /&gt;There is some software implementing this LSH idea. 
I would like to try them in the following days.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5147356132559741729?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5147356132559741729/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5147356132559741729' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5147356132559741729'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5147356132559741729'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/near-optimal-hashing-algorithms-for.html' title='Near-Optimal Hashing Algorithms For Approximate Nearest Neighbor in High Dimensions'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2521244894526484000</id><published>2011-06-06T22:33:00.000+08:00</published><updated>2011-06-06T22:33:01.026+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='map/reduce'/><title type='text'>Efficient Large-Scale Distributed Training of Conditional Maximum Entropy Models</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://research.google.com/pubs/archive/35648.pdf"&gt;&lt;i&gt;Gideon Mann, Ryan McDonald, Mehryar Mohri, Nathan Silberman and Daniel D. 
Walker&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This work might have inspired &lt;a href="http://paperscanner.blogspot.com/2011/06/parallelized-stochastic-gradient.html"&gt;the previous scanned paper&lt;/a&gt;, in that the proposed algorithm looks quite similar. This paper only focused on CME (a special case of convex optimization) and therefore the result is of comparatively limited usage in practice. The key difference of the two is this paper only employs a normal batch solver, unlike the stochastic solver in the latter.&lt;br /&gt;&lt;br /&gt;This algorithm is map/reduce friendly, though.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2521244894526484000?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2521244894526484000/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2521244894526484000' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2521244894526484000'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2521244894526484000'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/efficient-large-scale-distributed.html' title='Efficient Large-Scale Distributed Training of Conditional Maximum Entropy Models'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8134280088843793895</id><published>2011-06-06T22:24:00.000+08:00</published><updated>2011-06-06T22:24:27.534+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='stochastic gradient descent'/><category scheme='http://www.blogger.com/atom/ns#' term='map/reduce'/><title type='text'>Parallelized Stochastic Gradient Descent</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.research.rutgers.edu/%7Elihong/pub/Zinkevich11Parallelized.pdf"&gt;&lt;i&gt;Martin A. Zinkevich, Markus Weimer, Alex Smola and Lihong Li&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about a very interesting optimization technique. In online learning, SGD (stochastic gradient descent) is usually applied to optimize the model, since the data come in one by one (or one mini batch by another). For parametric models, the parameters are updated with one or a few samples sequentially.&lt;br /&gt;&lt;br /&gt;This paper talks about a parallelized version. Essentially we run several SGD on different machines and aggregate their result by averaging. However, we may even do not distribute all data across all machines. The proof of convergence looks dependent on the convexity of the objective functions but I suspect it may not.&lt;br /&gt;&lt;br /&gt;The result is quite interesting when we consider about the popular parallel computation framework, map/reduce. 
We'd better implement one ASAP.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8134280088843793895?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8134280088843793895/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8134280088843793895' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8134280088843793895'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8134280088843793895'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/06/parallelized-stochastic-gradient.html' title='Parallelized Stochastic Gradient Descent'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1782978326652902011</id><published>2011-01-21T18:45:00.001+08:00</published><updated>2011-01-21T18:49:24.242+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='random walk'/><title type='text'>Random Walks for Image Segmentation</title><content type='html'>&lt;div style="text-align: right;"&gt;by&lt;i&gt; &lt;a href="http://cns-web.bu.edu/%7Elgrady/grady2006random.pdf"&gt;Leo Grady&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed a random walk based algorithm for image segmentation. 
The idea is quite straightforward: some part of the images are labeled by the user and other areas are segmented using the random walk probability that stops at these landmarks.&lt;br /&gt;&lt;br /&gt;The interesting part of the paper is that it proposed a (several) linear system to solve the probability, instead of solving an eigenvalue problem.&lt;br /&gt;&lt;pre class="eq.latex"&gt;L_U X = -B&lt;/pre&gt;where &lt;code class="eq.latex"&gt;L_U&lt;/code&gt; is the graph Laplacian of the unlabelled samples and the &lt;code class="eq.latex"&gt;B&lt;/code&gt; is the cross block in the Laplacian of the whole graph.&lt;br /&gt;&lt;br /&gt;The idea was used in t-SNE to build p-table for landmarks.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1782978326652902011?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1782978326652902011/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1782978326652902011' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1782978326652902011'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1782978326652902011'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/random-walks-for-image-segmentation.html' title='Random Walks for Image Segmentation'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8284536384192132959</id><published>2011-01-14T15:13:00.000+08:00</published><updated>2011-01-14T15:13:42.888+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><category scheme='http://www.blogger.com/atom/ns#' term='multiple-label learning'/><title type='text'>Annotating Photo Collections by Label Propagation According to Multiple Similarity Cues</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://www.ifp.illinois.edu/%7Ecao4/papers/MM08_propagation.pdf"&gt;Liangliang Cao, Jiebo Luo and Thomas Huang&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;The image annotation problems is actually a multi-label propagation, unlike three previous papers. We are only interested in the label propagation part. Actually it is only a weighted counting for independent labels. 
Nothing new :-(&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8284536384192132959?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8284536384192132959/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8284536384192132959' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8284536384192132959'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8284536384192132959'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/annotating-photo-collections-by-label.html' title='Annotating Photo Collections by Label Propagation According to Multiple Similarity Cues'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1475537074842922807</id><published>2011-01-13T14:46:00.000+08:00</published><updated>2011-01-13T14:46:12.736+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='LLE'/><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><title type='text'>Semi-supervised Classification Using Linear Neighborhood Propagation</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a 
href="http://ieeexplore.ieee.org/iel5/10924/34373/01640755.pdf"&gt;&lt;i&gt;Fei Wang, Jingdong Wang, Changshui Zhang and Helen C. Shen&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;The so-called linear neighborhood propagation relies on the idea from LLE. Instead of using the graph weights directly, as in former methods (cf. Zhou and Zhu's papers, the two previous papers), the weights are computed using the idea from LLE. Therefore, in a way Zhou's version is something like diffusion map, Zhu's Laplacian eigenmap while this one LLE. We may find those counterparts in manifold learning.&lt;br /&gt;&lt;br /&gt;The procedure to calculate the weights is identical to that of LLE (by minimizing the affine reconstruction error). Then the weights are used to propagate the labels with the same objective function as in semi-supervised LLE (or landmark LLE). In this way, it eliminates the selection for a width for Gaussian kernels.&lt;br /&gt;&lt;br /&gt;Well, why not make an LTSA version? Haha...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1475537074842922807?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1475537074842922807/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1475537074842922807' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1475537074842922807'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1475537074842922807'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/semi-supervised-classification-using.html' title='Semi-supervised Classification Using Linear Neighborhood 
Propagation'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6327178811787120347</id><published>2011-01-13T14:20:00.000+08:00</published><updated>2011-01-13T14:20:50.188+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><title type='text'>Semi-supervised Learning Using Gaussian Fields and Harmonic Functions</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://pages.cs.wisc.edu/%7Ejerryzhu/pub/zgl.pdf"&gt;&lt;i&gt;Xiaojin Zhu, Zoubin Ghahramani and John Lafferty&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is also a piece of even earlier work on label propagation in semi-supervised setting. There are many fancy nouns, Gaussian random fields, energy minimization and harmonics... But it ends like something resembling the previous paper.&lt;br /&gt;&lt;br /&gt;They explain the label propagation as the minimization of the so-called energy function of a Gaussian field, which is actually the quadratic form of the Laplacian and therefore corresponds to the so-called elliptic PDE (and hence many more nouns: Green function, harmonics). The little difference might be they are using &lt;code class="eq.latex"&gt;D^{-1} W&lt;/code&gt; (one-step transition matrix) instead of the symmetrized and normalized version &lt;code class="eq.latex"&gt;D^{-1/2} W D^{-1/2}&lt;/code&gt; in the previously scanned paper. 
The closed form solution is obtained via Schur complement instead of the iterative procedure in Zhou's paper.&lt;br /&gt;&lt;br /&gt;The paper also discusses connections with random walks and electric networks, graph kernels, spectral clustering and N-cut. I believe due to the connection, this propagation idea must have been discussed with all kinds of variants (normalized? symmetrized?)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6327178811787120347?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6327178811787120347/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6327178811787120347' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6327178811787120347'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6327178811787120347'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/semi-supervised-learning-using-gaussian.html' title='Semi-supervised Learning Using Gaussian Fields and Harmonic Functions'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3049704046815387950</id><published>2011-01-12T14:14:00.001+08:00</published><updated>2011-01-12T14:21:25.099+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category 
scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><title type='text'>Learning with Local and Global Consistency</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://books.nips.cc/papers/files/nips16/NIPS2003_AA41.pdf"&gt;Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston and Bernhard Scholkopf&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper is about label propagation in semi-supervised learning setting. The basic idea about label propagation is to use a graph as a random walk structure. The label is propagated with the following equation &lt;br /&gt;&lt;pre class="eq.latex"&gt;Y(t + 1) = \alpha S Y(t) + (1 - \alpha) Y(0)&lt;/pre&gt;where &lt;code class="eq.latex"&gt;Y(t)&lt;/code&gt; is the label matrix at iteration &lt;code class="eq.latex"&gt;t&lt;/code&gt; (in multi-class classification, it is a unit vector for a labeled sample), &lt;code class="eq.latex"&gt;Y(0)&lt;/code&gt; is the initial labeling and &lt;code class="eq.latex"&gt;S&lt;/code&gt; is the normalized weight matrix. The iterative procedure will result in a limit of &lt;br /&gt;&lt;pre class="eq.latex"&gt;Y^\star = (1 - \alpha) (I - \alpha S)^{-1} Y(0)&lt;/pre&gt;This reveals some connection with the graph Laplacian. 
&lt;br /&gt;The interesting part is how we may develop a parallel version of this idea.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3049704046815387950?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3049704046815387950/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3049704046815387950' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3049704046815387950'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3049704046815387950'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/learning-with-local-and-glabal.html' title='Learning with Local and Global Consistency'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5030198850887116537</id><published>2011-01-12T10:48:00.000+08:00</published><updated>2011-01-12T10:48:24.818+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><category scheme='http://www.blogger.com/atom/ns#' term='multiple-label learning'/><title type='text'>Correlated Label Propagation with Application to Multi-label Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://portal.acm.org/citation.cfm?id=1153652" linkindex="350"&gt;Feng Kang, Rong Jin and Rahul 
Sukthankar&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about multi-label propagation. Usually label propagation is commonly seen in semi-supervised learning tasks. As for multi-labeled tasks, the critical difference is that labels may have correlationships (co-occurrences).&lt;br /&gt;&lt;br /&gt;The paper proposes a linear programming model for the task &lt;br /&gt;&lt;pre class="eq.latex"&gt;\max_{z \in \mathbb{R}^n} \sum_{k = 1}^n \alpha_k z_k \qquad \text{s.t. } \forall t \in \{ 0, 1\}^n, z^\top t \leq \sum_{i = 1}^N K(x_i, x_q) \Omega(t^\top t(S_i)), z \succeq 0&lt;/pre&gt;where &lt;code class="eq.latex"&gt;\Omega&lt;/code&gt; is a concave function. The interpretation of this objective function is: for any possible labelling of the test sample, the propagated score is bounded from above by weighted (by the similarity, depicted by &lt;code class="eq.latex"&gt;K(\cdot, \cdot)&lt;/code&gt;) sum of score from those samples sharing labels.&lt;br /&gt;The amazing thing about the objective function is that a greedy algorithm can find the optimal when &lt;code class="eq.latex"&gt;\Omega&lt;/code&gt; is concave and the solution is only determined by the order of the undetermined coefficients &lt;code class="eq.latex"&gt;\alpha_k&lt;/code&gt;. 
This optimization is related to the so-called sub-modular optimization (see &lt;a href="https://secure.wikimedia.org/wikipedia/en/wiki/Submodular" linkindex="351"&gt;here&lt;/a&gt;).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5030198850887116537?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5030198850887116537/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5030198850887116537' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5030198850887116537'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5030198850887116537'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2011/01/correlated-label-propagation-with.html' title='Correlated Label Propagation with Application to Multi-label Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4585784495516788195</id><published>2010-12-19T15:02:00.001+08:00</published><updated>2011-01-12T10:30:08.702+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='aspecti mode'/><category scheme='http://www.blogger.com/atom/ns#' term='real-time answer'/><title type='text'>The Anatomy of a Large-scale Social Search Engine</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a 
href="http://vark.com/aardvarkFinalWWW2010.pdf"&gt;&lt;i&gt;Damon Horowitz and Sapandar D. Kamvar&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is a WWW10 paper on real-time answer. The idea is to build a village model of knowledge sharing instead of the traditional model of library, e.g. the search engine solution provided by Google.&lt;br /&gt;&lt;br /&gt;The users of aardvark have their social graph information collected from several sources: e.g. facebook friends, email contacts, IM contacts and etc, in which the users' questions will be propagated. The users must specify their expertise: either specifying by selecting some items or provide some publishing information to analyze (e.g. twitter, blog). The system builds two indices, ISAM index for the social graph and an inverted index for user's expertise and then enables the user's behavior.&lt;br /&gt;&lt;br /&gt;The user's query will be analyzed (to see whether it is a question or not and what topic it is) and give the user a chance to determine the type (since it is immature to automatically determine the topic so far). Then a proper question will be handled by the routing suggestion using the social graph and expertise information.&lt;br /&gt;&lt;br /&gt;Therefore, the core of the village model is the routing algorithm. The routing procedure is actually the same as a ranking (of users) problem: &lt;br /&gt;&lt;pre class="eq.latex"&gt;s(u_i, u_j, q) = \Pr(u_j \mid u_i) \Pr(u_j \mid q)&lt;/pre&gt;where the first term is measured via users intimacy using the social network (regarding to social connection, demographic similarity, profile similarity, vocabulary match, chattiness, verbosity, politeness and speed) and the second term is learned with an aspecti model (just as pLSA). 
In practice, the probability &lt;code class="eq.latex"&gt;\Pr(t \mid u_i)&lt;/code&gt; is smoothed over the social nets, since if one's friends know something, he either knows it or knows whom to ask about it.&lt;br /&gt;The rank engine works as follows: first it retrieves those users with matched expertise (if the question is location sensitive, this would also be considered in the retrieval); secondly it uses the connectedness to find the one with a proper relationship and lastly it computes whether the query could be dealt with by the user using availability information.&lt;br /&gt;&lt;br /&gt;The rest are many small pieces we have to put together:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;whether the proposed text is actually a question? (need a classifier)&lt;/li&gt;&lt;li&gt;whether the question is trivial? (we may have vertical search engines for retrieving the desired result without asking someone)&lt;/li&gt;&lt;li&gt;whether the question is location sensitive?&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;The whole platform is kind of difficult to construct but the idea is somewhat easy to grasp.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4585784495516788195?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4585784495516788195/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4585784495516788195' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4585784495516788195'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4585784495516788195'/><link rel='alternate' type='text/html' 
href='http://paperscanner.blogspot.com/2010/12/anatomy-of-large-scale-social-search.html' title='The Anatomy of a Large-scale Social Search Engine'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5484088981105632134</id><published>2010-12-08T20:07:00.000+08:00</published><updated>2010-12-08T20:07:15.513+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='novel idea'/><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><title type='text'>Search Logs as Information Footprints: Supporting Guided Navigation for Exploratory Search</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="https://www.ideals.illinois.edu/bitstream/handle/2142/11489/Search%20Logs%20as%20Information%20Footprints%20Supporting%20Guided%20Navigation%20for%20Exploratory%20Search.pdf?sequence=2" linkindex="17"&gt;&lt;i&gt;Xuanhui Wang, Bin Tan, Azadeh Shakery, ChengXiang Zhai&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This is a paper on building a exploration interface based on a clustering algorithm (star clustering), that can arrange search queries according to the similarities of the search queries into tree structure. Then the material is shown to the users in a tree structured interface, incorporating search utility.&lt;br /&gt;&lt;br /&gt;It's quite similar to what we are doing and may be I will develop another parallel clustering algorithm based on affinity propagation. 
But let's first scan the result of star clustering.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5484088981105632134?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5484088981105632134/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5484088981105632134' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5484088981105632134'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5484088981105632134'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/12/search-logs-as-information-footprints.html' title='Search Logs as Information Footprints: Supporting Guided Navigation for Exploratory Search'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6414663290708587912</id><published>2010-12-06T01:10:00.000+08:00</published><updated>2010-12-06T01:10:51.924+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='decision tree'/><category scheme='http://www.blogger.com/atom/ns#' term='large-scale problem'/><category scheme='http://www.blogger.com/atom/ns#' term='map/reduce'/><category scheme='http://www.blogger.com/atom/ns#' term='MPI'/><title type='text'>The Stochastic Gradient Boosted Distributed Decision Trees</title><content type='html'>&lt;div style="text-align: 
right;"&gt;by &lt;a href="http://portal.acm.org/citation.cfm?id=1646301" linkindex="90"&gt;&lt;i&gt;Jerry Ye, Jyh-Herng Chow, Jiang Chen and Zhaohui Zheng&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed two solutions for implementing the exact stochastic GBDT, which was developed by the famous statistician Friedman. I'd like to scan his two previous papers later as the first study of decision trees (others will be CART and C4.5... I guess).&lt;br /&gt;&lt;br /&gt;The map/reduce implementation is based on the previously scanned paper, using horizontal splits, while the MPI implementation uses vertical splits. The former is quite direct; the latter requires communication using all-to-all broadcasting.&lt;br /&gt;&lt;br /&gt;Maybe after studying the GBDT, I would have a better understanding of this paper.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6414663290708587912?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6414663290708587912/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6414663290708587912' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6414663290708587912'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6414663290708587912'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/12/stochastic-gradient-boosted-distributed.html' title='The Stochastic Gradient Boosted Distributed Decision Trees'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2824172851707934031</id><published>2010-12-06T01:01:00.000+08:00</published><updated>2010-12-06T01:01:09.558+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='decision tree'/><category scheme='http://www.blogger.com/atom/ns#' term='large-scale problem'/><category scheme='http://www.blogger.com/atom/ns#' term='map/reduce'/><title type='text'>A Framework for Learning from Dsitributed Data Using Sufficient Statistics and its Application to Learning Decision Trees</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.iastate.edu/%7Ehonavar/Papers/ijhis.pdf"&gt;&lt;i&gt;Doina Caragea, Adrian Silvescu and Vasant Honavar&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper addresses training a large-scale problem using two types of split of data: horizontal fragmentation (sample subsets) and vertical fragmentation (feature subsets). The orientation is determined when writing row-wise sample matrix.&lt;br /&gt;&lt;br /&gt;The key idea behind this paper is to extract sufficient statistics from&amp;nbsp; splits of data so that we may aggregate the statistics in the last to get the exact model. This is quite direct in the case of MoE (when each&amp;nbsp; expert is an exponential family). This paper, however, has the emphasis in decision trees.&lt;br /&gt;&lt;br /&gt;The decision trees are trained layer-wise, finding splits of data using features that maximized the information gain (or some similar criteria). 
The paper discussed two cases:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;if the data are horizontally splitted, each site compute the required statistics and they are all combine to find the best split; the split then is passed back to each site so that each site knows which node the samples are in and they could take a second collection of sufficient stats.&lt;/li&gt;&lt;li&gt;if the data are vertically splitted, each site could figure out its own split and they will be compared and find the best; however, the split is then represented by the indices (so that each site may understand which samples are in which node);&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;This solution is accurate and can be applied to implementing the decision trees under map/reduce framework directly. And it should also apply to many hierarchical models too.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2824172851707934031?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2824172851707934031/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2824172851707934031' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2824172851707934031'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2824172851707934031'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/12/framework-for-learning-from-dsitributed.html' title='A Framework for Learning from Dsitributed Data Using Sufficient Statistics and its Application to Learning Decision 
Trees'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5462527951944179085</id><published>2010-11-21T20:08:00.000+08:00</published><updated>2010-11-21T20:08:57.098+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='transfer learning'/><category scheme='http://www.blogger.com/atom/ns#' term='Markov logic'/><title type='text'>Deep Transfer via Second-Order Markov Logic</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://turing.cs.washington.edu/papers/davis_icml2009.pdf"&gt;&lt;i&gt;Jesse Davis and Pedro Domingos&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;Transfer learning is a topic I am not familiar with. It is said to cope with training with data that differs from the testing data. There are two extents of transfer learning: so-called shallow transfer that deals with different distributions in training and testing (in the same domain) and so-called deep transfer that deals with different domains in training and testing.&lt;br /&gt;&lt;br /&gt;The deep transfer is possible only because different domains share the same logic, and this is actually where I think Markov logic should be able to play an important role. The paper explains why we must use second-order logic (due to finding domain-independent knowledge) and relational (for transfer learning). Their proposed algorithm is DTM (deep transfer via Markov logic).&lt;br /&gt;&lt;br /&gt;For the experiments, the authors use three domains, which seem not in the least related to each other (yeast protein, webkb and social nets data from facebook). I am still not sure whether these experiments really show how their transfers work. 
Maybe we should return to this paper after a more careful study.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5462527951944179085?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5462527951944179085/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5462527951944179085' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5462527951944179085'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5462527951944179085'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/11/deep-transfer-vis-second-order-markov.html' title='Deep Transfer via Second-Order Markov Logic'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4788552489870704625</id><published>2010-11-21T19:37:00.000+08:00</published><updated>2010-11-21T19:37:26.994+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Markov logic'/><category scheme='http://www.blogger.com/atom/ns#' term='relational learning'/><title type='text'>Statistical Predicate Invention</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.washington.edu/homes/pedrod/papers/mlc07.pdf"&gt;&lt;i&gt;Stanley Kok and Pedro Domingos&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about how 
to find statistical predicate (SPI problems, statistical predicate invention). This, from another talk given by Domingos, is equivalent to find latent variables in traditional probabilistical learning, one of the ten most important problems in the following decades in machine learning. The setting of this paper is second-order Markov logic.&lt;br /&gt;&lt;br /&gt;Their proposed approach for this problem is MRC (multiple relational clustering). The multiple relational clustering is interpreted via a simple example in the paper: one's technical skills and hobbies should be mined from different groups (clusters) of people, e.g. coworkers may share similar technical skills while friends share similar hobbies. The relational clustering is to find the latent relationship between people, and therefore ultimately finds who are coworkers, friends (latent predicates or r.v.). The resulting algorithm is not easy to understand. It looks like a clustering algorithm in logic language.&lt;br /&gt;&lt;br /&gt;There is another piece of work (infinite relational model) using CRP in relational modelling, which I think should be very interesting. 
We will try to see the details later.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4788552489870704625?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4788552489870704625/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4788552489870704625' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4788552489870704625'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4788552489870704625'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/11/statistical-predicate-invention.html' title='Statistical Predicate Invention'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6667469578426814595</id><published>2010-11-13T21:56:00.000+08:00</published><updated>2010-11-13T21:56:01.998+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ranking'/><title type='text'>The PageRank Citation Ranking: Bringing Order to the Web</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf"&gt;&lt;i&gt;Lawrence Page, Sergey Brin, Rajeev Motwani and Terry Winograd&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This might be the famous paper introducing the PageRank to the ranking research and the famous search engine 
Google to the Internet. The key idea behind the PageRank that differentiate it from the back link counts is that a back link from an authorized site should be more valuable. Therefore the backlinks must be weighted by its own rank. So the last scanned paper's recursion makes sense.&lt;br /&gt;&lt;br /&gt;A more interesting question is how to make a distributed version. I think in a way this is equivalent to solving some linear system but I haven't really try to derive it.&lt;br /&gt;&lt;br /&gt;This could be a contributing factor for real ranking algorithm (which uses many other features as well).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6667469578426814595?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6667469578426814595/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6667469578426814595' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6667469578426814595'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6667469578426814595'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/11/pagerank-citation-ranking-bringing.html' title='The PageRank Citation Ranking: Bringing Order to the Web'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3390807472479578347</id><published>2010-11-13T20:32:00.002+08:00</published><updated>2010-11-13T20:32:11.928+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='multiple-instance learning'/><title type='text'>Multiple Instance Learning for Computer Aided Diagnosis</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://books.nips.cc/papers/files/nips19/NIPS2006_0624.pdf"&gt;&lt;i&gt;Glenn Fung, Murat Dundar, Balaji Krishnapuram, R. Bharat Rao&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about MIL problems, addressing them by a convex hull based algorithm. The key idea comes from a previously proposed relaxation (in another paper). In MIL, the basic assumption is that each positive bag contains at least one positive sample. The relaxation of this assumption is that in the convex hull of each positive bag, there exists a point that could be correctly classified.&lt;br /&gt;&lt;br /&gt;With this relaxation, the authors propose a general form for MIL, with which many known algorithms can be formulated, e.g. SVM and Fisher discriminant criterion for MIL. 
On a whole this is an application paper.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3390807472479578347?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3390807472479578347/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3390807472479578347' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3390807472479578347'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3390807472479578347'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/11/multiple-instance-learning-for-computer.html' title='Multiple Instance Learning for Computer Aided Diagnosis'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8436597767776637742</id><published>2010-11-10T09:20:00.000+08:00</published><updated>2010-11-10T09:20:55.180+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='HITS'/><category scheme='http://www.blogger.com/atom/ns#' term='PageRank'/><category scheme='http://www.blogger.com/atom/ns#' term='ranking'/><title type='text'>The Intelligent Surfer: Probabilistic Combination of Link and Content Information in PageRank</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a 
href="http://ai.cs.washington.edu/www/media/papers/richardson-domingos02a.pdf"&gt;&lt;i&gt;Matthew Richardson and Pedro Domingos&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;I started to read those papers by Domingos to follow their idea on Markov logic. Now I have already read a few pages from their tutorials and wish to know more about their past research background.&lt;br /&gt;&lt;br /&gt;This paper talks about ranking, which I am not quite familiar with. This paper introduces two previous methods, i.e. HITS and PageRank, which I'd like to scan later. The HITS model is a little bit complicated and can't be served online (they have to compute hubs and authorities at query time) while PageRank can utilize the PageRank at offline stage (i.e. after the crawling, they may compute the PageRank using incoming links and outgoing links, which later serves as a factor contributing to the final ranking function).&lt;br /&gt;&lt;br /&gt;The graph PageRank builds consists of links:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;using the outgoing links and incoming links;&lt;/li&gt;&lt;li&gt;if a webpage has no outgoing links, it links to all other pages;&lt;/li&gt;&lt;/ul&gt;So we may set a random walker on the graph and find the probability it gets to the page using the formula&lt;br /&gt;&lt;pre class="eq.latex"&gt;\displaystyle P(j) = \frac{1 - \beta}{N} + \beta \sum_{i \in B(j)} \frac{P(i)}{|F_i|}&lt;/pre&gt;where &lt;code class="eq.latex"&gt;B(j)&lt;/code&gt; contains all pages that link to page &lt;code class="eq.latex"&gt;j&lt;/code&gt; and &lt;code class="eq.latex"&gt;F_i&lt;/code&gt; contains the pages linked from page &lt;code class="eq.latex"&gt;i&lt;/code&gt;.&lt;br /&gt;This paper's main idea is to incorporate the query information into the PageRank, which, you may have already seen from above, contains nothing about the query &lt;code class="eq.latex"&gt;q&lt;/code&gt;. 
Let&lt;br /&gt;&lt;pre class="eq.latex"&gt;P_q(j) = (1 - \beta) P_q'(j) + \beta \sum_{i \in B(j)} P_q(i) P_q(i \to j)&lt;/pre&gt;, where those query related terms are all derived from query relevance scores, e.g.&lt;pre class="eq.latex"&gt;P_q'(j) = R_q(j) / \sum_{k \in W} R_q(k), \qquad P_q(i \to j) = R_q(j) / \sum_{k \in F_i} R_q(k)&lt;/pre&gt;.&lt;br /&gt;To overcome the computation problem, the authors suggest we pre-compute the ranking score for the search queries offline.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8436597767776637742?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8436597767776637742/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8436597767776637742' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8436597767776637742'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8436597767776637742'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/11/intelligent-surfer-probabilistic.html' title='The Intelligent Surfer: Probabilistic Combination of Link and Content Information in PageRank'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3572835355966961525</id><published>2010-10-07T10:30:00.000+08:00</published><updated>2010-10-07T10:30:04.905+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='bundle method'/><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='solver'/><title type='text'>Bundle Methods for Regularized Risk Minimization</title><content type='html'>by Choon Hui Teo, S.V.N. Vishwanathan, Alex Smola and Quoc V. Le&lt;br /&gt;&lt;br /&gt;This paper introduces a bundle methods for RRM. In optimization, the standard optimization technique that derives bundle method is cutting plane method.&lt;br /&gt;&lt;br /&gt;In cutting plane methods, the convex functions are bounded from below by a series of cutting planes, constructed via subgradients. If we have many of them, they may reduce the space to search for optima. And actually, we select the next point using &lt;code lang="eq.latex"&gt;w_t = \arg\min_w J_t^\text{CP} (w)&lt;/code&gt; where &lt;code lang="eq.latex"&gt;J_t^\text{CP}(w) = \max_{i} J(w_{i-1}) + \langle w - w_{i-1}, s_i \rangle&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;The cutting plane method suffers from slow convergence. Bundle methods improve it with proximal functions (basically quadratic functions to approximate the objective function). 
There are several ways of constructing the proximal functions:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;proximal: &lt;code lang="eq.latex"&gt;w_t = \arg\min_w \frac{\zeta_t}{2} \| w - \hat{w}_{t-1} \|^2 + J_t^\text{CP} (w)&lt;/code&gt;&lt;/li&gt;&lt;li&gt;trust region: &lt;code lang="eq.latex"&gt;w_t = \arg\min_w \{ J_t^\text{CP}(w) \mid \frac{1}{2} \|w - \hat{w}_{t-1} \|^2 \leq \kappa_t\}&lt;/code&gt;&lt;/li&gt;&lt;li&gt;level set: &lt;code lang="eq.latex"&gt;w_t = \arg\min_w \{ \frac{1}{2} \| w - \hat{w}_{t-1} \|^2 \mid J_t^\text{CP} (w) \leq \tau_t\}&lt;/code&gt;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;This paper proposed another bundle method, yielding an &lt;code lang="eq.latex"&gt;O(\log(1/\epsilon))&lt;/code&gt; convergence bound. It has several variants, one of which doesn't involve line search.&lt;br /&gt;&lt;br /&gt;Unfortunately, yourequation.com is down due to heavy requests. I have to switch to another equation rendering method ASAP.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3572835355966961525?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3572835355966961525/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3572835355966961525' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3572835355966961525'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3572835355966961525'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/10/bundle-methods-for-regularized-risk.html' title='Bundle Methods for Regularized Risk
Minimization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3931790975788282673</id><published>2010-10-07T01:51:00.000+08:00</published><updated>2010-10-07T01:51:36.475+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Gibbs sampler'/><category scheme='http://www.blogger.com/atom/ns#' term='topic model'/><title type='text'>PLDA: Parallel Latent Dirichlet Allocation for Large-Scale Applications</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://plda.googlecode.com/files/aaim.pdf"&gt;Yi Wang, Hongjie Bai, Matt Stanton, Wen-yen Chen and Edward Y. Chang&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper discussed two implementations of PLDA, one using MPI and another using Map/Reduce. It seems that the Map/Reduce framework in google doesn't support iteration too. The comparison of LDA implementations via variational Bayesian, EP and Gibbs sampler was mentioned. But somehow quite strange, I would like to make the comparison myself sometime later.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Some models can be naturally extended into its parallel version, such as many monte-carlo methods. In the authors' implementation I see many similarity as another version by my colleagues. 
Wen-yen now is also one of my colleagues now, small world!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3931790975788282673?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3931790975788282673/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3931790975788282673' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3931790975788282673'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3931790975788282673'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/10/plda-parallel-latent-dirichlet.html' title='PLDA: Parallel Latent Dirichlet Allocation for Large-Scale Applications'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1870565924512510619</id><published>2010-10-05T17:04:00.000+08:00</published><updated>2010-10-05T17:04:39.697+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dynamics'/><title type='text'>Templates and Anchors: Neuromechanical Hypothesis of Legged Locomotion on Land</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://repository.upenn.edu/cgi/viewcontent.cgi?article=1514&amp;amp;context=ese_papers" linkindex="92"&gt;R.J. Full and D.E. 
Koditschek&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;I am not a biologist and don't quite understand the point. Generally I think they are discussing using templates to simplify the modelling of complicated locomotion of organism. They proposed two templates, SLIP (spring-loaded inverted pendulum) and LLS (lateral leg spring), trying to justify their model in biology, maybe. I can hardly find any equations in their paper, so I guess the math required to model these templates should be comparatively simple. Maybe using the Lagrangian, we may obtain the motion equation easily?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1870565924512510619?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1870565924512510619/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1870565924512510619' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1870565924512510619'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1870565924512510619'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/10/templates-and-anchors-neuromechanical.html' title='Templates and Anchors: Neuromechanical Hypothesis of Legged Locomotion on Land'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1303681085642603558</id><published>2010-10-05T16:43:00.001+08:00</published><updated>2010-10-05T16:44:08.374+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='stochastic gradient descent'/><category scheme='http://www.blogger.com/atom/ns#' term='large-scale problem'/><category scheme='http://www.blogger.com/atom/ns#' term='hashing'/><category scheme='http://www.blogger.com/atom/ns#' term='collaborative filtering'/><title type='text'>Collaborative Filtering on a Budget</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://jmlr.csail.mit.edu/proceedings/papers/v9/karatzoglou10a/karatzoglou10a.pdf"&gt;Alexandros Karatzoglou, Alex Smola and Markus Weimer&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about dealing with large scale collaborative filtering. The collaborative filtering can be formulated as a matrix factorization problem and we may try several loss functions with different regularizer. One typical example is the &lt;a href="http://paperscanner.blogspot.com/2008/01/maximum-margin-matrix-factorization.html" linkindex="16"&gt;M3F paper&lt;/a&gt; previously scanned here. A convenient solver is stochastic gradient descent.&lt;br /&gt;&lt;br /&gt;The core idea proposed is we may use two hash functions (one for user and another for items recommended) to aggregate the user matrix and item matrix to eliminate the computational cost in large-scale problems. These matrices are approximated with the help of Rademacher functions. But I have no idea why this is possible. 
Maybe I will take a look some day later.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1303681085642603558?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1303681085642603558/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1303681085642603558' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1303681085642603558'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1303681085642603558'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/10/collaborative-filtering-on-budget.html' title='Collaborative Filtering on a Budget'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-354664883173906971</id><published>2010-10-02T21:42:00.000+08:00</published><updated>2010-10-02T21:42:28.994+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dynamics'/><title type='text'>Spacetime Constraints</title><content type='html'>&lt;div style="text-align: right;"&gt;by Andrew Witkin and Michael Kass&lt;/div&gt;&lt;br /&gt;This is a paper from graphics talking about how to implement an animation of objects, such as &lt;a href="http://www.pixar.com/shorts/ljr/theater/short_320.html" linkindex="290"&gt;luxo lamp&lt;/a&gt; from Pixar (see &lt;a 
href="https://secure.wikimedia.org/wikipedia/en/wiki/Luxo_Jr." linkindex="291"&gt;details of Lamp Jr.&lt;/a&gt; on wikipedia).&lt;br /&gt;&lt;br /&gt;First thing is about a spacetime particle. We get the ODE for the particle using Newton's law and typically we should solve the ODE using some numerical methods. We have some initial value constraints and also an objective function (minimum fuel exhausted in the procedure). This is actually a variational optimization problem. The numerical solution is obtained by discretization. The derivatives are approximated via finite differences. And then the objective function actually turns out to be a quadratic form. So we get a linearly constrained quadratic optimization problem.&lt;br /&gt;&lt;br /&gt;For a complex object, the ODE is obtained via Lagrange dynamics (hmm... I have almost forgotten it). So actually we may solve a similar problem (but much more complicated, you have to analyse case by case).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-354664883173906971?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/354664883173906971/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=354664883173906971' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/354664883173906971'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/354664883173906971'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/10/spacetime-constraints.html' title='Spacetime
Constraints'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2531508714128845550</id><published>2010-08-30T23:50:00.000+08:00</published><updated>2010-08-30T23:50:40.874+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='topic model'/><category scheme='http://www.blogger.com/atom/ns#' term='RBM'/><title type='text'>Replicated Softmax: an undirected topic model</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.toronto.edu/%7Ehinton/absps/repsoft.pdf"&gt;&lt;i&gt;Ruslan Salakhutdinov and Geoffrey Hinton&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about an application of restricted Boltzmann machine. The visible layer contains a matrix of 0-1 variables (indicating whether the &lt;i&gt;k&lt;/i&gt;th sample takes value &lt;i&gt;i&lt;/i&gt;). And the hidden layer is again binary variables. After defining the conditional probability (since the conditional probability for visible layer is multinomial and therefore a softmax function on the hidden units, that's where the model's name comes from; replicated for the weight matrix is shared), we may use contrastive divergence to train the model. In topic model, the so-called log-perplexity (probability of observing testing samples) has to be computed so as to compare with other topic models (such as LDA). Annealed importance sampling is employed to compute the partition function.&lt;br /&gt;&lt;br /&gt;There is some link of this model with semantic hash previously developed by the authors. 
Experiments show favorable results for the undirected topic model.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2531508714128845550?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2531508714128845550/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2531508714128845550' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2531508714128845550'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2531508714128845550'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/replicated-softmax-undirected-topic.html' title='Replicated Softmax: an undirected topic model'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2983137446295457385</id><published>2010-08-25T00:39:00.000+08:00</published><updated>2010-08-25T00:39:28.395+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='visualization'/><category scheme='http://www.blogger.com/atom/ns#' term='deep belief network'/><title type='text'>Learning a Parametric Embedding by Preserving Local Structure</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://jmlr.csail.mit.edu/proceedings/papers/v5/maaten09a/maaten09a.pdf"&gt;&lt;i&gt;Laurens van der
Maaten&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed a parametric embedding for t-SNE. The mapping is parametrized via deep belief nets. The objective function of t-SNE then back-propagates to the weights of several layers. The idea is simple but the implementation should be tedious.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2983137446295457385?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2983137446295457385/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2983137446295457385' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2983137446295457385'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2983137446295457385'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/learning-parametric-embedding-by.html' title='Learning a Parametric Embedding by Preserving Local Structure'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4508402138940751651</id><published>2010-08-25T00:35:00.000+08:00</published><updated>2010-08-25T00:35:48.268+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='visualization'/><title type='text'>Parametric Embedding for Class Visualization</title><content type='html'>&lt;div style="text-align: right;"&gt;by
&lt;a href="http://web.mit.edu/cocosci/Papers/nips04pe.pdf"&gt;&lt;i&gt;Tomoharu Iwata, Kazumi Saito, Naonori Ueda, Sean Stromsten, Thomas Griffiths and Joshua B. Tenenbaum&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper addresses the problem of visualizing posterior probabilities of topic models. We have scanned another paper on how to visualize the documents via a probabilistic graphical model. Actually this paper is the inspiration of the latter.&lt;br /&gt;&lt;br /&gt;By introducing an objective function resembling SNE, this paper solves an easier problem: the p table in SNE is now given, instead of computed via binary searches; the q table has a much smaller size.&lt;br /&gt;&lt;br /&gt;Actually the latter paper decouples the number of topics from the embedding dimensions directly (but coupled with the number of clusters). We may observe quite similar phenomenon in this model.&lt;br /&gt;&lt;br /&gt;As SNE, this model is also optimized with a gradient-descent method (alternatively, though).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4508402138940751651?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4508402138940751651/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4508402138940751651' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4508402138940751651'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4508402138940751651'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/parametric-embedding-for-class.html' title='Parametric Embedding for Class
Visualization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6557586120975647056</id><published>2010-08-22T20:25:00.000+08:00</published><updated>2010-08-22T20:25:17.340+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='visualization'/><category scheme='http://www.blogger.com/atom/ns#' term='topic model'/><category scheme='http://www.blogger.com/atom/ns#' term='EM'/><category scheme='http://www.blogger.com/atom/ns#' term='bayesian framework'/><title type='text'>Probabilistic Latent Semantic Visualization: Topic Model for Visualizing Documents</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.kecl.ntt.co.jp/as/members/ueda/pdf/2008_probabilistic.pdf"&gt;&lt;i&gt;Tomoharu Iwata, Takeshi Tamada and Naonori Ueda&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed a model based on LDA. But the Dirichlet prior is replaced with the probability generated by the latent coordinates of each documents and the topics. The paper only deals with MAP estimation of the latent coordinates, which can be solved via EM-like algorithm.&lt;br /&gt;&lt;br /&gt;The learning is simple for the distribution of words conditioned on topics (analytic solution) while difficult for the latent coordinates due to the optimization (has to be solved via gradient-based numerical solutions).&lt;br /&gt;&lt;br /&gt;The idea is interesting though. 
Instead of seeking a representation of the documents in the topic space learn by LDA-like models, the visualization is directly modeled via a probabilistic graphical model.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6557586120975647056?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6557586120975647056/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6557586120975647056' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6557586120975647056'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6557586120975647056'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/probabilistic-latent-semantic.html' title='Probabilistic Latent Semantic Visualization: Topic Model for Visualizing Documents'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7681726360308272349</id><published>2010-08-22T20:17:00.001+08:00</published><updated>2010-08-22T20:17:35.853+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Gibbs sampler'/><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><category scheme='http://www.blogger.com/atom/ns#' term='topic model'/><category scheme='http://www.blogger.com/atom/ns#' term='bayesian framework'/><title 
type='text'>A Probabilistic Approach to Semantic Representation</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://cocosci.berkeley.edu/tom/papers/semrep.pdf"&gt;&lt;i&gt;Thomas L. Griffiths and Mark Steyvers&lt;/i&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;This paper actually introduces a Gibbs sampler for the LDA model. The Gibbs sampler, however, does not sample all latent variables. Only latent topics are sampled. I guess many DP models actually are making inferences in this way. But is this the so-called Bayesian way of learning? Sampling ?= learning. Well.... I don't know why they may do this. I have to see more...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7681726360308272349?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7681726360308272349/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7681726360308272349' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7681726360308272349'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7681726360308272349'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/probabilistic-approach-to-semantic.html' title='A Probabilistic Approach to Semantic Representation'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32'
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4094811848654717121</id><published>2010-08-09T19:51:00.000+08:00</published><updated>2010-08-09T19:51:10.603+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><category scheme='http://www.blogger.com/atom/ns#' term='nonnegative constraints'/><title type='text'>Relation between PLSA and NMF and Implications</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.70.8839&amp;amp;rep=rep1&amp;amp;type=pdf"&gt;Eric Gaussier and Cyril Goutte&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper's critical point is PLSA solve the NMF problem with a KL divergence loss, which can be easily seen if you are familiar with the two algorithms. The implications are:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;The NMF algorithms can be interpreted as EM algorithms;&lt;/li&gt;&lt;li&gt;The NMF does not hold many advantages as pLSA.&lt;/li&gt;&lt;li&gt;But NMF might benefit with many loss functions for different purposes.&amp;nbsp; &lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4094811848654717121?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4094811848654717121/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4094811848654717121' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4094811848654717121'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/447099751126587323/posts/default/4094811848654717121'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/relation-between-plsa-and-nmf-and.html' title='Relation between PLSA and NMF and Implications'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5993715327388229562</id><published>2010-08-06T14:16:00.000+08:00</published><updated>2010-08-06T14:16:27.779+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='text'/><category scheme='http://www.blogger.com/atom/ns#' term='EM'/><category scheme='http://www.blogger.com/atom/ns#' term='frequentism'/><title type='text'>Probabilistic Latent Semantic Analysis</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://www.cs.brown.edu/%7Eth/papers/Hofmann-UAI99.pdf"&gt;Thomas Hofmann&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper introduces a probabilistic model for LSA problem. In traditional LSA, we have a word-document matrix (each column correspond to a document, each row denotes the count of a certain word). The LSA employs a SVD of the count matrix and indicates that the left singular vectors are latent topics. NMF might be more appropriate since the bases found are nonnegative and can be seen as distributions of words.&lt;br /&gt;&lt;br /&gt;This paper builds the first probabilistic model for the latent topics. The model is quite simple&lt;br /&gt;&lt;pre lang="eq.latex"&gt;\Pr(w, d) = \sum_z \Pr(z) \Pr(d\mid z) \Pr(w \mid z)&lt;/pre&gt;which can be trained with EM algorithm. The inference of this model is a bit awkward. 
But we may simply use &lt;code lang="eq.latex"&gt;\Pr(w \mid z)&lt;/code&gt; for inference problems.&lt;br /&gt;&lt;br /&gt;Later, LDA actually endow Dirichlet priors to the mixing proportions to the topics and words.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5993715327388229562?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5993715327388229562/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5993715327388229562' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5993715327388229562'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5993715327388229562'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/08/probabilistic-latent-semantic-analysis.html' title='Probabilistic Latent Semantic Analysis'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2883219923571026705</id><published>2010-07-26T18:25:00.000+08:00</published><updated>2010-07-26T18:25:30.631+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dimension reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='fixed-point algorithm'/><title type='text'>Heavy-Tailed Symmetric Stochastic Neigbor Embedding</title><content type='html'>&lt;div style="text-align: right;"&gt;by 
&lt;i&gt;&lt;a href="http://books.nips.cc/papers/files/nips22/NIPS2009_0664.pdf"&gt;Zhirong Yang, Irwin King, Zenglin Xu and Erkki Oja&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;t-SNE is a very useful visualization algorithm, which inherits the idea from SNE but modifies the neighborhood probabilities to t-distributions instead of the original. This heavy-tailed distribution works pretty well on many data. This paper discusses a more general case.&lt;br /&gt;&lt;br /&gt;After transforming the original problem into its Lagrange dual, the authors get a unified algorithm (fixed-point iteration) for a family of heavy tailed distribution (including t-distribution). This is somewhat not as interesting as I have expected.&lt;br /&gt;&lt;br /&gt;The authors also discussed how to integrate supervised information into the SNE algorithm. But their idea is kind of rough: inserting a similarity computed from the supervised information into the similarity in the high-dimensional space. I don't know whether this would truly work...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2883219923571026705?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2883219923571026705/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2883219923571026705' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2883219923571026705'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2883219923571026705'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/07/heavy-tailed-symmetric-stochastic.html' title='Heavy-Tailed Symmetric Stochastic 
Neigbor Embedding'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7292695982256832219</id><published>2010-07-26T17:24:00.000+08:00</published><updated>2010-07-26T17:24:42.295+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='feature extraction'/><title type='text'>Dirichlet Component Analysis: Feature Extraction for Compositional Data</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://icml2008.cs.helsinki.fi/papers/129.pdf"&gt;Hua-Yan Wang, Qiang Yang, Hong Qin and Hongbin Zha&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper discusses a problem of extracting features from compositional data (nonnegative features that sums up to 1). The problem is interesting though I am not sure about the major difficulties. To get a proper projection into lower dimensional space, we have to conform to a certain set of constraints (balanced rearrangement). A regularization operator is devised to preserve the Euclidean geometry. The rearrangement will shrink the data while regularization expands them.&lt;br /&gt;&lt;br /&gt;The optimization is quite strange (solved via genetic algorithm). The maximization w.r.t. &lt;code lang="eq.latex"&gt;\alpha&lt;/code&gt; looks like a MLE but the minimization w.r.t. 
the rearrangement matrix doesn't make much sense to me.&lt;br /&gt;&lt;br /&gt;I am still unsure about the application's intention.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7292695982256832219?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7292695982256832219/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7292695982256832219' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7292695982256832219'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7292695982256832219'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/07/dirichlet-component-analysis-feature.html' title='Dirichlet Component Analysis: Feature Extraction for Compositional Data'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3543452923699317749</id><published>2010-02-15T21:14:00.000+08:00</published><updated>2010-02-15T21:14:55.674+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='nonnegative constraints'/><category scheme='http://www.blogger.com/atom/ns#' term='gradient'/><title type='text'>Projected Gradient Methods for Nonnegative Matrix Factorization</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a 
href="http://www.csie.ntu.edu.tw/%7Ecjlin/papers/pgradnmf.pdf"&gt;Chih-Jen Lin&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed an optimization technique for NMF problems, which I think could also be extended to more general Bregman divergence. The idea is based on optimization. It is pointed out that most algorithms can converge to local stationary points not local minima. The multiplicative updating rule only ensures the convergence, whether it be stationary point (let alone local minima) or not. The proposed algorithm is quite simple, merely eliminating those coords out of the feasible domain (i.e. set those negative values to 0). The result looks better than the multiplicative updating rule.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3543452923699317749?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3543452923699317749/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3543452923699317749' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3543452923699317749'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3543452923699317749'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/02/projected-gradient-methods-for.html' title='Projected Gradient Methods for Nonnegative Matrix Factorization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6090650420601242291</id><published>2010-02-12T22:53:00.004+08:00</published><updated>2010-02-15T20:14:13.154+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Bregman divergence'/><category scheme='http://www.blogger.com/atom/ns#' term='nonnegative constraints'/><title type='text'>Generalized Nonnegative Matrix Approximation with Bregmen Divergence</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://books.nips.cc/papers/files/nips18/NIPS2005_0203.pdf"&gt;Inderjit S. Dhillon and Suvrit Sra&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper introduces a family of objective in the form of Bregman divergence. From &lt;a href="http://en.wikipedia.org/wiki/Bregman_divergence"&gt;this page&lt;/a&gt;, you may see how broad the family is. For any given continuously differentiable and strictly convex function &lt;code lang="eq.latex"&gt;f(\cdot)&lt;/code&gt;, the Bregman divergence is defined as &lt;br /&gt;&lt;pre lang="eq.latex"&gt;B_f(p \parallel q) = f(p) - f(q) - \langle \nabla f(q), p - q\rangle&lt;/pre&gt;which can be regarded as the difference of function value change and a linear approximation of the change. There are many interesting properties. Please check it out. This paper uses the Bregman divergence to measure the goodness of NMF, which naturally extends the original NMF objectives in &lt;a href="http://paperscanner.blogspot.com/2010/02/algorithms-for-nonnegative-matrix.html"&gt;the previous paper scanned here&lt;/a&gt;. 
Please note that both &lt;code lang="eq.latex"&gt;B_\phi(x_i, Wv_i)&lt;/code&gt; and &lt;code lang="eq.latex"&gt;B_\phi(Wv_i, x_i)&lt;/code&gt; can be employed as the objective function, but the former one requires less computational work.&lt;br /&gt;&lt;br /&gt;The most surprising fact is that the auxiliary function used to formulate the multiplicative updating rule can also be extended into the more general Bregman divergence case if the auxiliary function satisfies the same constraints&lt;br /&gt;&lt;pre lang="eq.latex"&gt;G(x,x) = f(x), \qquad G(x, y) \geq f(x), \forall y.&lt;/pre&gt;The authors provide the following &lt;code lang="eq.latex"&gt;f(\cdot)&lt;/code&gt;,&lt;br /&gt;&lt;pre lang="eq.latex"&gt;f(v) = B_\phi(Wv, x)&lt;/pre&gt;given the approximation problem &lt;br /&gt;&lt;pre lang="eq.latex"&gt;X = (x_1 ,\ldots, x_N) \approx W V = W(v_1, \ldots, v_N).&lt;/pre&gt;The function &lt;code lang="eq.latex"&gt;G(\cdot, \cdot)&lt;/code&gt; is carefully designed to mimic the original auxiliary function,&lt;br /&gt;&lt;pre lang="eq.latex"&gt;G(v, u) = \sum_{i, j} \lambda_{i, j} f\left( \frac{W_{i, j} v_j}{\lambda_{i, j}} \right) - \left( \sum_i f(x_i) + f'(x_i)(W v_i - x_i)\right)&lt;/pre&gt;They also derived another multiplicative rule from the KKT conditions.&lt;br /&gt;&lt;br /&gt;This result is a really interesting usage of Bregman distance.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6090650420601242291?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6090650420601242291/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6090650420601242291' title='0 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/447099751126587323/posts/default/6090650420601242291'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6090650420601242291'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/02/generalized-nonnegative-matrix.html' title='Generalized Nonnegative Matrix Approximation with Bregmen Divergence'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8955083284618082237</id><published>2010-02-12T22:40:00.000+08:00</published><updated>2010-02-12T22:40:41.696+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='nonnegative constraints'/><title type='text'>Algorithms for Nonnegative Matrix Factorization</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://hebb.mit.edu/people/seung/papers/nmfconverge.pdf"&gt;Daniel D. Lee and H. Sebastian Seung&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper follows their Nature paper. They proposed two objectives for the decomposition. Their algorithm also relies on the "auxiliary function". The updating rule is kind of interesting due to their multiplicative property. 
This is later generalized to Bregman divergence in a NIPS05 paper.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8955083284618082237?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8955083284618082237/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8955083284618082237' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8955083284618082237'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8955083284618082237'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/02/algorithms-for-nonnegative-matrix.html' title='Algorithms for Nonnegative Matrix Factorization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-161825247236186000</id><published>2010-01-31T10:22:00.000+08:00</published><updated>2010-01-31T10:22:40.714+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='label propagation'/><title type='text'>Non-metric Label Propagation</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/ijcai09a.pdf"&gt;Yin Zhang and Zhi-hua Zhou&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This paper 
follows the idea of the non-metric similarity matrix analysis. By decomposing the Gram matrix into two separate graphs (one for positive eigenvalues and the other for the negative), they build two separate Markov chains, which comprise a mixture of Markov models for label propagation (just an explicit solution of linear equations). Their paper contains many experiments as usual, which I think might be the deficit of my own research work.&lt;br /&gt;&lt;br /&gt;The idea is not that fancy but the application in label propagation might be novel, the research style of Zhou's :-p That requires keen olfaction.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-161825247236186000?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/161825247236186000/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=161825247236186000' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/161825247236186000'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/161825247236186000'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/01/non-metric-label-propagation.html' title='Non-metric Label Propagation'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1338351710484214168</id><published>2010-01-31T10:05:00.000+08:00</published><updated>2010-01-31T10:05:02.092+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='similarity'/><category scheme='http://www.blogger.com/atom/ns#' term='non-metric methods'/><title type='text'>Feature Discovery in Non-metric Pairwise Data</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;i&gt;&lt;a href="http://jmlr.csail.mit.edu/papers/volume5/laub04a/laub04a.pdf"&gt;Julian Laub and Klaus-Robert Muller&lt;/a&gt;&lt;/i&gt;&lt;/div&gt;&lt;br /&gt;This is a paper about how to analyze pairwise "distance" or similarity matrices. Since not all similarity matrices can be transformed into a Gram matrix (as we do in MDS), it is interesting to gain a deeper insight into the details.&lt;br /&gt;&lt;br /&gt;Basically, we may imagine there are two metrics, one for similarity and another for dissimilarity (penalizing the similarity in human perception). 
By applying a spectral transformation, we may use metric methods if the spectra can be fixed (no negative).&lt;br /&gt;&lt;br /&gt;The problem is how we may utilize the negative part of the spectra.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1338351710484214168?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1338351710484214168/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1338351710484214168' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1338351710484214168'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1338351710484214168'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2010/01/feature-discovery-in-non-metric.html' title='Feature Discovery in Non-metric Pairwise Data'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2749920620603121127</id><published>2009-11-23T14:34:00.005+08:00</published><updated>2009-11-23T15:35:19.442+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><title type='text'>Two-view Feature Generation Model for Semi-supervised Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://riejohnson.com/rie/AZ_unlabeled_cr1.pdf"&gt;Rie Kubota Ando and Tong 
Zhang&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;We first take a look at their logic: for semi-supervised learning, a generative model is usually preferred since unlabeled data help estimate the marginal distribution &lt;code lang="eq.latex"&gt;\Pr(x)&lt;/code&gt;. In a Bayesian MAP formulation, we are actually&lt;pre lang="eq.latex"&gt;\min_\alpha - \sum_i \log \Pr(y_i \mid \alpha, x_i) - \log \Pr(x_u \mid \alpha)\Pr(\alpha)&lt;/pre&gt;which is a little different from a direct generative model. Here the first term is actually a discriminative term and the second term is a penalty from unlabeled part (therefore it is more similar to a ``supervised loss + penalty'' model). This paper does talk about models of the latter, using auxiliary problems.&lt;br /&gt;&lt;br /&gt;The two-view model means, analogous to co-training, we have two views of the feature vector &lt;code lang="eq.latex"&gt;x&lt;/code&gt;, namely &lt;code lang="eq.latex"&gt;z_1(x), z_2(x)&lt;/code&gt;, which are independent conditioned on the label. The different thing about this model is in order to solve &lt;code lang="eq.latex"&gt;\Pr(y \mid z_1, z_2)&lt;/code&gt;, we need &lt;code lang="eq.latex"&gt;\Pr(y \mid z_1), \Pr(y \mid z_2)&lt;/code&gt;. Now we only consider &lt;code lang="eq.latex"&gt;\Pr(y \mid z_1)&lt;/code&gt;. One possibility is to make a low-rank decomposition of &lt;code lang="eq.latex"&gt;\Pr(z_2 \mid z_1) = \sum_y \Pr(z_2 \mid y) \Pr(y \mid z_1)&lt;/code&gt; but the LHS is sometimes impossible to compute. An approximation is to encode &lt;code lang="eq.latex"&gt;z_2&lt;/code&gt; with a set of binary labels &lt;code lang="eq.latex"&gt;t_1^k(z_2)&lt;/code&gt;. Then &lt;code lang="eq.latex"&gt;\Pr( t_1^k \mid z_1) = \sum_y \Pr(t_1^k \mid y) \Pr(y \mid z_1)&lt;/code&gt; can be computed. 
By increasing the number of related binary labels &lt;code lang="eq.latex"&gt;t_1^k&lt;/code&gt; we may have a good estimation of &lt;code lang="eq.latex"&gt;\Pr(y \mid z_1)&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;They proposed two models (one linear and the other log-linear, which resembles linear regression and logistic regression in a way). The linear version coincides with the SVD-ASO model in &lt;a href="http://paperscanner.blogspot.com/2009/11/framework-for-learning-predicative.html"&gt;their JMLR paper&lt;/a&gt;. The log-linear model is solved via an EM-like algorithm.&lt;br /&gt;&lt;br /&gt;The thing is what kind of binary auxiliary functions would be essential to our semi-supervised problems? This might be a key to understanding their JMLR paper for multi-task learning.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2749920620603121127?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2749920620603121127/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2749920620603121127' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2749920620603121127'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2749920620603121127'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/two-view-feature-generation-model-for.html' title='Two-view Feature Generation Model for Semi-supervised Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4536521232364978603</id><published>2009-11-23T14:25:00.005+08:00</published><updated>2009-11-23T15:36:09.089+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='multi-task learning'/><title type='text'>A Framework for Learning Predicative Structures from Multiple Tasks and Unlabeled Data</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://jmlr.csail.mit.edu/papers/volume6/ando05a/ando05a.pdf"&gt;Rie Kubota Ando and Tong Zhang&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper addresses a framework for multi-task learning. Their idea is quite simple. There is a common factor &lt;code lang="eq.latex"&gt;\Theta&lt;/code&gt; which is shared in different but related problems. Therefore in each problem &lt;code lang="eq.latex"&gt;P_k&lt;/code&gt;, our parameters include &lt;code lang="eq.latex"&gt;w_i&lt;/code&gt;, which is problem-specific and &lt;code lang="eq.latex"&gt;v_i&lt;/code&gt; which is dependent on the common feature controlled by &lt;code lang="eq.latex"&gt;\Theta&lt;/code&gt;. To solve the model it is usually desirable to alternately optimize over &lt;code lang="eq.latex"&gt;w_i, v_i&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\Theta&lt;/code&gt;. Usually a regularizer is also included for better generalization capacity.&lt;br /&gt;&lt;br /&gt;Using this idea, the authors proposed a linear model which is solved by the ASO using SVD in each iteration to find &lt;code lang="eq.latex"&gt;\Theta&lt;/code&gt; (SVD-ASO in their term). 
With this idea, they analyzed the semi-supervised learning with auxiliary functions, which are essentially those multi-tasks.&lt;br /&gt;&lt;br /&gt;Their extension of this piece of work is scanned &lt;a href="http://paperscanner.blogspot.com/2009/11/two-view-feature-generation-model-for.html"&gt;here&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4536521232364978603?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4536521232364978603/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4536521232364978603' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4536521232364978603'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4536521232364978603'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/framework-for-learning-predicative.html' title='A Framework for Learning Predicative Structures from Multiple Tasks and Unlabeled Data'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-635537376501990492</id><published>2009-11-21T14:22:00.003+08:00</published><updated>2009-11-21T16:19:40.011+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='multiple-kernel learning'/><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised 
learning'/><category scheme='http://www.blogger.com/atom/ns#' term='feature selection'/><title type='text'>Discriminative Semi-supervised Feature Selection via Manifold Regularization</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://ijcai.org/papers09/Papers/IJCAI09-219.pdf"&gt;Zenglin Xu, Rong Jin, Michael R. Lyu and Irwin King&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about feature selection via SVM. The semi-supervised part is enabled by adding a manifold regularizer. The method is to multiply the feature with a diagonal 0-1 matrix (selecting features). With these variables in the optimization as well, we get the optimization for this problem. The key idea to solve this problem is to reformulate it with the dual of SVM but leaving the feature selecting variables alone. Then the optimum is the saddle point of the optimization problem. This kind of problem can be found in multiple-kernel learning, which has a standard algorithm (alternating optimization w.r.t. different variables).&lt;br /&gt;&lt;br /&gt;The idea of using SVM for feature selection is not new. The contribution might be the semi-supervised setting. In my own research it seems that we still do not have a clear goal of achieving this with other methods. 
hmm...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-635537376501990492?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/635537376501990492/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=635537376501990492' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/635537376501990492'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/635537376501990492'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/discriminative-semi-supervised-feature.html' title='Discriminative Semi-supervised Feature Selection via Manifold Regularization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4162275172058205509</id><published>2009-11-12T01:11:00.002+08:00</published><updated>2009-11-12T01:13:56.592+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>Dappled Photography: Mask Enhanced Cameras for Heterodyned Light Fields and Coded Aperture Refocusing</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.umiacs.umd.edu/%7Eaagrawal/sig07/MatlabCodeImages.html"&gt;&lt;span style="font-style: italic;"&gt;A. Veeraraghavan, R. Raskar, A. Agrawal, A. Mohan and J. 
Tumblin&lt;/span&gt;&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;A light field conveys both spatial and angular distribution of light incident on the camera sensor. The pioneer work to capture a light field in one photographic exposure is the plenoptic camera, a device that uses a microlens array to rearrange a 4D light field and capture it with a 2D sensor. However, the optics of the microlens array defines a fixed resolution tradeoff between spatial and angular sampling of the light field.&lt;br /&gt;In this paper, the authors propose to modulate the light field by shadowing the incoming light with a mask in the optical path. In the Fourier Light Field Space (FLS), the mask creates a train of identical kernels positioned in a slanted slice, and thereby, via convolution, pulls high angular frequencies to the central angular slice, the only slice the camera measures in the FLS. Assuming that the incident light field is band limited, the captured image is the flattened version of the incident light field in the Fourier domain.&lt;br /&gt;Moreover, the slant of the mask kernel, which decides the spatial-angular resolution tradeoff, is determined by the location of the mask. Consequently, the resolution tradeoff can be adjusted by translating the mask. The minimal and the maximal angular resolution are achieved by placing the mask at the aperture and at the conjugate plane respectively.&lt;br /&gt;However, the mask enhanced camera seems to trade reconstruction quality for flexibility. First, the optimal pattern of the mask varies with its location yet in practice the mask pattern is permanent. Second, the mask blocks about half of the incident light and reduces the signal-to-noise ratio of the sensed image. 
After all, the paper provides a profound analysis on the principle of mask enhanced cameras, a major category of computational camera, making itself influential in the Computational Photography community.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4162275172058205509?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4162275172058205509/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4162275172058205509' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4162275172058205509'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4162275172058205509'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/dappled-photography-mask-enhanced.html' title='Dappled Photography: Mask Enhanced Cameras for Heterodyned Light Fields and Coded Aperture Refocusing'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7796999047826690236</id><published>2009-11-12T01:08:00.001+08:00</published><updated>2009-11-12T01:10:45.789+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>4D Frequency Analysis of Computational Cameras for Depth of Field Extension</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a 
href="http://www.wisdom.weizmann.ac.il/%7Elevina/papers/lattice/"&gt;&lt;span style="font-style: italic;"&gt;A. Levin, S. W. Hasinoff, P. Green, F. Durand and W. T. Freeman&lt;/span&gt;&lt;/a&gt;&lt;br /&gt;&lt;div style="text-align: left;"&gt;    Although many types of cameras are invented to extend their depth of field (DoF), none of them optimize the quality of the resulting image or, equivalently, maximize the modulation transfer function (MTF). In this paper, the authors perform a 4D frequency analysis to estimate the maximal frequency spectrums of optical systems.&lt;br /&gt;    The key of the analysis lies in the observation of the dimensional gap between the 3D MTF and the 4D ambiguity function that characterizes a camera: the former was a manifold embedded in the latter, called “the focal segments”. To maximize the MTF, therefore, the ambiguity function is desired to uniformly distribute all the energy on these segments. This analysis leads to an upper bound of the MTF.&lt;br /&gt;    Unfortunately, most contemporary computational cameras waste energy out of the region. The only exception is the focal sweep camera, but the phase incongruence of its OTF across various focus settings lowers the spectrum magnitude. The authors propose the lattice-focal lens. This lens is composed of a number of sub-squares, each responsible for focusing light rays from a specific depth. This spatial division of aperture also concentrates energy on the focal region, but achieves a much higher spectrum than the focal sweep camera.&lt;br /&gt;    The ambiguity function, defined as auto-correlation of the 2D scalar field of an optical system, is a redundant representation. This prohibits the authors from determining the tight upper bound of the frequency spectrum. Still, the proposed analysis sheds much light on this question. Although there is no explicit analysis, it indicates that the key of maximizing MTFs may lie in phase incoherence of the optical system.
&lt;br /&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7796999047826690236?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7796999047826690236/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7796999047826690236' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7796999047826690236'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7796999047826690236'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/4d-frequency-analysis-of-computational.html' title='4D Frequency Analysis of Computational Cameras for Depth of Field Extension'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-1316772197292966330</id><published>2009-11-12T01:05:00.000+08:00</published><updated>2009-11-12T01:07:59.970+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>Flexible Depth of Field Photography</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www1.cs.columbia.edu/CAVE/projects/flexible_dof/"&gt;H. Nagahara, S. Kuthirummal, C. Zhou, and S. 
Nayar&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;    The depth of field (DoF) of a lens is confined to a frontal-parallel slab. In this paper, the authors attempt to break this limitation by proposing flexible depth of field photography, i.e. to translate the detectors within the shutter. Thus at each moment the lens produces a point spread function (PSF) associated with the sensor position, and the final PSF, called integrated PSF (IPSF) is the sum of all the PSFs produced over the exposure.&lt;br /&gt;&lt;br /&gt;       Under this scheme, the authors suggested three applications for manipulating DoF. In the first application, the sensors are translated uniformly to produce a depth-independent, frequency preserving IPSF, ensuring a good quality of the restored all-in-focus image. In the second application, the authors play with non-uniform translations. Unwanted depth layers in the middle can be skipped so that their image would be blurry enough to be unnoticeable. The authors also showed that by rolling the detector’s exposure time during the translation, an arbitrary shape of DoF can be produced. All the applications are realized with a prototype flexible DoF imaging system built by the author.&lt;br /&gt;&lt;br /&gt;       The idea of flexible DoF is interesting, but it is not as novel as the authors claim in the paper. Although never thoroughly investigated, focus sweep is a widely used technique in photography under the name variable-focus photography; uniform sweeping of the DoF was even proposed as early as 1972 by Hausler. The major difference between Hausler’s work and this paper is just replacing focus sweeping with sensor translation, which is minor. 
The usefulness of the other two applications is somewhat vague due to the underlying assumption that scene depth is known before capture.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-1316772197292966330?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/1316772197292966330/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=1316772197292966330' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1316772197292966330'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/1316772197292966330'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/flexible-depth-of-field-photography.html' title='Flexible Depth of Field Photography'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7990109999493558437</id><published>2009-11-12T00:59:00.002+08:00</published><updated>2009-11-12T01:03:40.805+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>Time-Constrained Photography</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.toronto.edu/%7Ehasinoff/timecon/"&gt;S. W. Hasinoff, K. N. Kutulakos, F. Durand, and W. T. 
Freeman&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;Current designs and evaluations of depth-of-field (DoF) extended cameras all assume a single photo is captured, yet within the same exposure time, a focal stack shot may give rise to a less noisy restoration. Therefore, the authors propose to evaluate the performance of camera designs in the multiple-shots scenario.&lt;br /&gt;&lt;br /&gt;The authors firstly develop a Bayesian approach to restore the depth map and the all-in-focus image of a scene; they also estimate the scene-independent restoration error as a function of the interested range of depth and the schedule of capture. Fixing the total exposure time, they seek the optimal number of photos to capture: a few more photos would increase the signal-to-noise ratio (SNR) of the restored image because the dominant source of image noise, the photon noise, is multiplicative; the other source, the additive read noise, would penalize excessive photos though.&lt;br /&gt;&lt;br /&gt;According to the expected restoration error, the authors compared the performance of various camera designs. Surprisingly, the conventional camera performs as well as, if not better than, other types of cameras in all but the low time budget settings; it also benefits most from the multiple-shot scheme. In contrast, the coded aperture camera performs poorly in terms of light efficiency because its mask blocks most of the light from the sensor.&lt;br /&gt;&lt;br /&gt;This paper constitutes a strong defense against intensity masking techniques by alluding to the crucial role of time budgets in DoF extension. Indeed, there is no point in designing a DoF extended camera without a time constraint: closing the aperture is most convenient. 
For the same reason, albeit computational cameras do not outperform conventional ones in most occasions, they serve the most demanding purpose: to capture a sharp image with the minimal exposure.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7990109999493558437?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7990109999493558437/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7990109999493558437' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7990109999493558437'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7990109999493558437'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/time-constrained-photography.html' title='Time-Constrained Photography'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8819822552891942197</id><published>2009-11-12T00:49:00.003+08:00</published><updated>2009-11-12T00:53:32.421+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>Fourier Slice Photography</title><content type='html'>&lt;div style="text-align: right;"&gt;&lt;i&gt;by &lt;a href="http://graphics.stanford.edu/papers/fourierphoto/"&gt;Ren Ng&lt;/a&gt;&lt;br /&gt;&lt;/i&gt;&lt;div style="text-align: left;"&gt;&lt;p 
class="MsoNormal"&gt;&lt;span style="font-family:Georgia;"&gt;&lt;span style="font-size:100%;"&gt;Unlike the 2D definition of image, light field more expressively measures the light radiance along all possible 4D rays. This notion was initially suggested to synthesize new views of non-Lambertian scenes by ray tracing, yet in this paper, the author is interested in employing this tool to model photographic imaging.&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;  &lt;p class="MsoNormal"&gt;&lt;span style=";font-family:Georgia;color:black;"  &gt;&lt;span style="font-size:100%;"&gt;The core of this work was the Fourier Slice Photography Theorem, developed from a generalization of Bracewell’s Fourier Slice Theorem. The latter addresses that the Fourier Transform (FT) of a signal’s integral projection corresponds to a sheared slice in its FT. As the imaging process can be seen as a sheared integral of the light field, in the Fourier domain, a photograph formed with full aperture corresponds to a 2D slice in the 4D light field. This discovery speeds up digital refocusing from O(n&lt;sup&gt;4&lt;/sup&gt;) to O(n&lt;sup&gt;2&lt;/sup&gt;logn) and improves precision because Fourier Transform has a fast implementation and avoids the numerical loss in integral. The advantage of the new algorithm was also validated by experiments.&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;  &lt;p class="MsoNormal"&gt;&lt;span style=";font-family:Georgia;color:black;"  &gt;&lt;span style="font-size:100%;"&gt;To show the utility of the theorem, the author goes further to provide in closed form the performance limit of plenoptic cameras of finite aperture. Preliminarily quantitative analysis was only possible for the simplest pinhole model as the assumption of aperture complicates the problem by low-passing the light field. The author tackles this difficulty by working in the Fourier domain. Firstly he shows that the 4D convolution of the light field yields 2D convolution of photographs focused at a variety of depths. 
Applying the Fourier Slice Photography Theorem to this statement, he concludes that a band-limit assumption on plenoptic cameras would degrade its performance in digital refocusing, and the amount of degradation increases linearly with the directional resolution of the sampled light field.&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;  &lt;p class="MsoNormal"&gt;&lt;span style=";font-family:Georgia;color:black;"  &gt;&lt;span style="font-size:100%;"&gt;This paper gives an in-depth insight to the relationship between a light field and its photographic images, and pioneers theoretical analysis of optical designs in the Computational Photography community. It also inspires the invention of new light field cameras, e.g. Raskar’s design of dappled photography. Although the author only focused on aberration-free lens models, the Theorem of Fourier Slice Photography can be applied to a broader range of optical systems.&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8819822552891942197?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8819822552891942197/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8819822552891942197' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8819822552891942197'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8819822552891942197'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/11/fourier-slice-photography.html' title='Fourier Slice 
Photography'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5696202535927241658</id><published>2009-11-12T00:39:00.003+08:00</published><updated>2009-11-12T00:48:44.211+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='computational photography'/><title type='text'>Extended Depth of Field through Wavefront Coding</title><content type='html'>&lt;div style="text-align: right;"&gt;&lt;i&gt;by &lt;a href="http://www.colorado.edu/isl/papers/edf/paper.html"&gt;E. Dowski and W. Cathey&lt;/a&gt;&lt;br /&gt;&lt;/i&gt;&lt;div style="text-align: left;"&gt;&lt;p align="JUSTIFY"&gt; The authors worked on extending the depth of field (DoF) of optical systems, the range of distance from which objects can be imaged in full detail. Beyond this range, the image is blurred, mathematically modeled as a convolution between the in-focus image and a point spread function (PSF) associated with the distance of the object. In the more general definition, an object is also considered to be within the DoF if its image can be ideally restored from what is read from the sensor.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; Previously the mainstream approach to DoF extension is to block light at the aperture with an apodizer, but the optical power at the sensor is also decreased, resulting in a much longer exposure. The only prior approach with full aperture was Hausler's focus sweep method. 
However, it is limited in application due to the requirement to continuously change the focus setting during exposure.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; The authors proposed to attach a phase mask to the optical system to achieve a PSF that is invariant to misfocus and beneficial to recovery of the full-resolution image, in the sense that its optical transfer function (OTF) has large values within its passband. Thus, the in-focus image  could be fully recovered from the sensed image without knowledge about depth of the object.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; To compute the profile of the phase mask, the authors firstly computed the OTF as a function of depth and phase profile of the mask. The OTF at a specific distance was proved to be a slice of the ambiguity function across the center, and the slope of the slice corresponds to the amount of misfocus. Therefore, the PSF produced by an optical system is constant to focal distance only when its corresponding ambiguity function is rotationally invariant over the angular region that corresponds to the extended DoF.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; Limiting the profile of the phase mask to be monomial, the authors further derived that it has to be in cubic form. The bandwidth of cubic phase masked(cubic-pm) systems was also analyzed as a function of the monomial coefficient to guarantee that the OTF would not have zeros in its passband.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; In experiments the authors compared the half-maximum amplitude and Fisher information of the cubic-pm PSFs to standard ones. Although the cubic-pm design greatly outperforms, this comparison may be unfair to the standard one because insensitivity to focus change is not indispensable to DoF extension. Nevertheless, the above experiments did suggest that cubic-pm design avoids the obstacle of PSF identification.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; A comparison of restored images was also performed by simulation under noise-free assumption. 
Again, the cubic-pm optics appeared superior to the standard one. The key to its success lies in a wider support of the OTF.  Acknowledging that noise is unavoidable, the authors estimated the signal-to-noise ratio of cubic-pm systems to be more than 20dB. Still, the experiments could have been more persuasive had the authors included results in real optical systems to account for not only noise, but also manufacturing imprecision of the phase mask.&lt;/p&gt; &lt;p align="JUSTIFY"&gt; In summary, this paper presents a novel solution to DoF extension. Although it was published 15 years ago, its influence in Computational Photography (CP) is still significant. On one hand, the need to maximize the amount of light at the sensor is increasingly emphasized in the area and phase masking remains a unique solution to extended DoF under this constraint till now. On the other hand, this paper highlights the importance of ambiguity function, which has recently been found to be the bridge between light field theory in CP and wavefront optics.&lt;/p&gt;&lt;/div&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5696202535927241658?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5696202535927241658/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5696202535927241658' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5696202535927241658'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5696202535927241658'/><link rel='alternate' type='text/html' 
href='http://paperscanner.blogspot.com/2009/11/extended-depth-of-field-through.html' title='Extended Depth of Field through Wavefront Coding'/><author><name>tangtang</name><uri>http://www.blogger.com/profile/02817027709059823182</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5101960249896468933</id><published>2009-10-03T20:44:00.003+08:00</published><updated>2009-10-03T21:09:06.188+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='maximum margin'/><category scheme='http://www.blogger.com/atom/ns#' term='SDP'/><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><category scheme='http://www.blogger.com/atom/ns#' term='SVM'/><title type='text'>Maximum Margin Clustering</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.uwaterloo.ca/%7El5xu/papers/nips04.pdf"&gt;Linli Xu, James Neufeld, Bryce Larson and Dale Schuurmans&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper is interesting in formulating a clustering problem as a convex optimization problem. Its framework is not brand new but the idea is very enlightening for a current project. Referring to maximum margin method, everyone knows how SVM is modelled with a quadratic programming problem with linear constraints. In clustering, we are not interested in the parametrized separation boundary but in the partition of samples that maximizes the boundary. Then the problem turns out to be an integer optimization problem, which is often NP-hard.&lt;br /&gt;&lt;br /&gt;The key idea is to use the outer product, i.e. the kernel matrix of the assignment matrix &lt;code lang="eq.latex"&gt;M = y y^\top&lt;/code&gt; in the dual formulation of soft-margin SVM. 
And another interesting finding is that instead of using non-convex constraint &lt;code lang="eq.latex"&gt;\mathrm{rank}(M) = 1&lt;/code&gt;, they use three linear constraints:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;&lt;code lang="eq.latex"&gt;M&lt;/code&gt; encodes equivalence class information, i.e. transitive &lt;code lang="eq.latex"&gt;M_{i, k} \geq M_{i, j} + M_{j, k} - 1&lt;/code&gt;, reflexive &lt;code lang="eq.latex"&gt;M_{i,i} = 1&lt;/code&gt; and symmetric &lt;code lang="eq.latex"&gt;M_{i, j} = M_{j, i}&lt;/code&gt;;&lt;/li&gt;&lt;li&gt;&lt;code lang="eq.latex"&gt;M&lt;/code&gt; has at most 2 equivalence classes, i.e. &lt;code lang="eq.latex"&gt;M_{j, k} \geq - M_{i, j} - M_{i, k} - 1&lt;/code&gt;;&lt;/li&gt;&lt;li&gt;&lt;code lang="eq.latex"&gt;M&lt;/code&gt; has at least 2 equivalence classes, &lt;code lang="eq.latex"&gt;\sum_i M_{i, j} \leq N - 2&lt;/code&gt;.&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;Then the problem becomes a SDP. It might not be easy to extend the idea to multi-clustering.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5101960249896468933?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5101960249896468933/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5101960249896468933' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5101960249896468933'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5101960249896468933'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/10/maximum-margin-clustering.html' title='Maximum Margin 
Clustering'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4973005879467266321</id><published>2009-09-05T21:00:00.004+08:00</published><updated>2009-09-05T21:26:23.065+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='consistency'/><category scheme='http://www.blogger.com/atom/ns#' term='SVM'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>On the Influence of the Kernel on the Consistency of Support Vector Machines</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://jmlr.csail.mit.edu/papers/volume2/steinwart01a/steinwart01a.pdf"&gt;Ingo Steinwart&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is kind of a math paper. I haven't really delved into mathematical stuff for a long time. This paper might be the first to explore the consistency of SVM (that is the asymptotic behavior of the classifier compared with the optimal, Bayes decision error). The main result might be as follows:&lt;br /&gt;For universal kernels we have consistency result for L2 and L1 soft margin classifiers.&lt;br /&gt;The universal kernels are those whose induced RKHS is dense in continuous function space. Gaussian and Laplacian kernels are both universal. The author derived the corresponding consistency.&lt;br /&gt;At first glance I thought they found some tricks in choosing regularization parameters. But I didn't find anything truly usable (e.g. corollary 18).&lt;br /&gt;&lt;br /&gt;The editor is Scholkopf... I guess only huge bulls like him understand the key points. 
For now I just have quite a faint idea.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4973005879467266321?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4973005879467266321/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4973005879467266321' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4973005879467266321'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4973005879467266321'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/09/on-influence-of-kernel-on-consistency.html' title='On the Influence of the Kernel on the Consistency of Support Vector Machines'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4879438012506592967</id><published>2009-07-29T20:41:00.002+08:00</published><updated>2009-07-29T20:56:37.545+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><title type='text'>Efficient Euclidean Projection in Linear Time</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/123.pdf"&gt;Jun Liu and Jieping Ye&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;I am really surprised why this paper got published. 
&lt;a href="http://paperscanner.blogspot.com/2009/07/efficient-projections-onto-l1-ball-for.html"&gt;The previously scanned paper &lt;/a&gt;in ICML 2008 noticed the linear algorithm already. Not sure...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4879438012506592967?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4879438012506592967/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4879438012506592967' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4879438012506592967'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4879438012506592967'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/efficient-euclidean-projection-in.html' title='Efficient Euclidean Projection in Linear Time'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3058241698189766978</id><published>2009-07-29T20:09:00.002+08:00</published><updated>2009-07-29T20:40:54.814+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>On Sampling-based Approximate Spectral Decomposition</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/256.pdf"&gt;Sanjiv Kumar, 
Mehryar Mohri and Ameet Talwalkar&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper proposed another approximation of kernel methods based on former Nystrom method and column sampling. The prime disadvantage of the earlier method is that they must compute the whole Gram matrix while the proposed adaptive method does not need to. They prove several results (not that interesting though): column sampling is best for rank 1 approximation (no one will use rank 1 approximation I guess...); for a given form, neither is optimal approximation (that is why they are not good enough?); under some cases (looks not useful in practice), Nystrom's recovery is exact.&lt;br /&gt;&lt;br /&gt;The improvement of approximation is not salient though, but I guess the efficiency might be improved.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3058241698189766978?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3058241698189766978/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3058241698189766978' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3058241698189766978'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3058241698189766978'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/on-sampling-based-approximate-spectral.html' title='On Sampling-based Approximate Spectral Decomposition'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4991912409707285677</id><published>2009-07-28T20:24:00.003+08:00</published><updated>2009-07-28T21:16:49.805+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='convolutional networks'/><category scheme='http://www.blogger.com/atom/ns#' term='deep belief network'/><title type='text'>Convolutional Deep Belief Networks for Scalable Unsupervised Learning of Hierarchical Representations</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/571.pdf"&gt;Honglak Lee, Roger Grosse, Rajesh Ranganath and Andrew Y. Ng&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This can be viewed as a deep version of the convolutional neural network (I am not quite familiar with this). In each layer (in a sense), there are two sublayers, one for detection, convolving the input with several filters, the other for max-pooling, shrinking (resizing the image) the convolved sublayer. Therefore on the whole the layer's input is an image and the output is several convolved and downsampled images. They stack layers of this kind and get the deep version.&lt;br /&gt;&lt;br /&gt;Let's see what we have to get. For each convolving sublayer, we have to train a convolution kernel and the corresponding biases (as in RBM). The change is the max-pooling layer. Each neuron in the max pooling layer only connects to a fixed size (a small patch, e.g. 2x2) of neurons in the convolutional sublayer. Since the neurons in the convolutional sublayer could be 0-1 and the max-pooling means only if none of the input neurons fires the output is 0, from the outside it is like a big neuron which can take multiple values (e.g. 2x2+1 = 5). 
Then we may do as in RBM, writing down the energy function, converting it to probability, formulating the likelihood and using CD learning with a sparsity penalty.&lt;br /&gt;&lt;br /&gt;The good idea behind the structure is that we first get some useful filters, then parts of the object and later the whole objects. The features learned with the model give good results on several data sets.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4991912409707285677?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4991912409707285677/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4991912409707285677' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4991912409707285677'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4991912409707285677'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/convolutional-deep-belief-networks-for.html' title='Convolutional Deep Belief Networks for Scalable Unsupervised Learning of Hierarchical Representations'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7410941036253338646</id><published>2009-07-23T08:46:00.003+08:00</published><updated>2009-07-23T09:11:40.860+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' 
term='ranking'/><title type='text'>Evaluating Search Engines by Modeling the Relationship Between Relevance and Clicks</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://books.nips.cc/papers/files/nips20/NIPS2007_0754.pdf"&gt;Ben Carterette and Rosie Jones&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper tells us about a method for evaluating two ranking results based on clicks of users. We know the ranking affects exposures of the links, i.e. the higher rank one item has, the more attention human beings pay to it. Therefore it is more reasonable to use a discounted relevance called discounted cumulative gain (DCG)&lt;pre lang="eq.latex"&gt;\mathrm{DCG}_l = \mathrm{rel}_1 + \sum_{i = 2}^l \frac{\mathrm{rel}_i}{\log_2 i}.&lt;/pre&gt;That is, the rank 1 item has no discount while rank &lt;code lang="eq.latex"&gt;i&lt;/code&gt; item has a discount ratio of &lt;code lang="eq.latex"&gt;1/\log_2 i&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;So if we want to compare two ranking results, we have to calculate the relevance scores given the query. This is not always known. People might have been asked to score the documents with discrete values (on a scale of 5, e.g.) but not all documents will be marked. We model this scale with a multinomial distribution, simulate the scoring procedure and compare the two DCG values. If in 95% of cases one is higher than the other, we might assert it is better.&lt;br /&gt;&lt;br /&gt;The authors proposed quite a simple model for &lt;code lang="eq.latex"&gt;\Pr(X_i \mid q, c)&lt;/code&gt;. It is an ordinal logistic regression model with linear and quadratic features. 
So when we trained the model, we can simulate the DCG and compare two rankings.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7410941036253338646?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7410941036253338646/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7410941036253338646' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7410941036253338646'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7410941036253338646'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/evaluating-search-engines-by-modeling.html' title='Evaluating Search Engines by Modeling the Relationship Between Relevance and Clicks'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8099858693881117205</id><published>2009-07-23T08:30:00.003+08:00</published><updated>2009-07-23T08:45:45.584+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dynamic system'/><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='Hopfield networks'/><category scheme='http://www.blogger.com/atom/ns#' term='MRF'/><title type='text'>Herding Dynamical Weights to Learn</title><content type='html'>&lt;div style="text-align: 
right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/447.pdf"&gt;Max Welling&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about a novel way of learning MRFs such as Hopfield networks (is that also a Boltzmann machine?). For these models, the primal-dual structure of maximum entropy learning and maximum likelihood estimation suggests the possible learning algorithms, e.g. gradient-based, since though the optimization is constrained in the former form, it isn't in the dual form.&lt;br /&gt;&lt;br /&gt;The gradient-based algorithms require solving the following inference problem&lt;pre lang="eq.latex"&gt;w_{\alpha}^{(t+1)} = w_\alpha^{(t)} + \eta( \bar{g}_\alpha - \mathbb{E}[g_\alpha]_P)&lt;/pre&gt;where the expectation is computed in the given model. The common ways of computing the expectation include a Gibbs sampler (MCMC) or mean field approximation (usually not good). For certain models, such as RBM, we might think about Hinton's contrastive divergence, but still we need non-deterministic algorithms.&lt;br /&gt;&lt;br /&gt;The herding algorithm the author proposed here is a deterministic algorithm. It takes the limit of the annealing version of the negated log-likelihood function and it results in a tipi function. 
He then formulated the herding algorithm as first maximizing (due to the limit) and resulting in some pseudo samples and then calculating the gradient based on these pseudo samples.&lt;br /&gt;&lt;br /&gt;I am not familiar with the dynamic system and the intrinsic idea behind this stuff though.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8099858693881117205?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8099858693881117205/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8099858693881117205' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8099858693881117205'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8099858693881117205'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/herding-dynamical-weights-to-learn.html' title='Herding Dynamical Weights to Learn'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5946502248239316360</id><published>2009-07-22T15:14:00.001+08:00</published><updated>2009-07-22T15:20:19.513+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='texture'/><title type='text'>Statistical Geometrical Features for Texture Classification</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a 
style="font-style: italic;" href="http://eprints.ecs.soton.ac.uk/333/"&gt;Yan Qiu Chen, Mark S. Nixon and David W. Thomas&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper simply provides us with 16 features for texture classification. Well... try them in some applications then.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5946502248239316360?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5946502248239316360/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5946502248239316360' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5946502248239316360'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5946502248239316360'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/statistical-geometrical-features-for.html' title='Statistical Geometrical Features for Texture Classification'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5924687601828220796</id><published>2009-07-20T14:45:00.005+08:00</published><updated>2009-07-22T15:13:27.166+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><title type='text'>An Efficient Projection for L_{1, infty} Regularization</title><content type='html'>&lt;div style="text-align: right;"&gt;by 
&lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/475.pdf"&gt;Ariadna Quattoni, Xavier Carreras, Michael Collins and Trevor Darrell&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This optimization problem is a little different from the L1 penalized version, in that the variable to be optimized is a matrix, and the corresponding penalty is the so-called &lt;code lang="eq.latex"&gt;L_{1, \infty}&lt;/code&gt; norm,&lt;pre lang="eq.latex"&gt;\| X \|_{1, \infty} = \sum_{i = 1}^m \max_j | X_{i, j} |.&lt;/pre&gt;The optimization is based on &lt;a href="http://paperscanner.blogspot.com/2009/07/efficient-projections-onto-l1-ball-for.html"&gt;a previous ICML 08 paper&lt;/a&gt;. But now we are dealing with matrices instead of vectors. For example, the toy optimization problem is modified to its matrix version&lt;pre lang="eq.latex"&gt;\min_{B, \mu} \frac{1}{2} \sum_{i, j} (A_{i, j} - B_{i, j})^2,&lt;/pre&gt;such that &lt;code lang="eq.latex"&gt;\forall i, j: B_{i, j} \leq \mu_i&lt;/code&gt;, &lt;code lang="eq.latex"&gt;\sum_i \mu_i = C&lt;/code&gt; and nonnegative constraints for &lt;code lang="eq.latex"&gt;B&lt;/code&gt; given a nonnegative matrix &lt;code lang="eq.latex"&gt;A&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;But somehow there seems to be nothing else new.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5924687601828220796?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5924687601828220796/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5924687601828220796' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5924687601828220796'/><link rel='self' 
type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5924687601828220796'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/efficient-projection-for-l1-infty.html' title='An Efficient Projection for L_{1, infty} Regularization'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8681239413292253877</id><published>2009-07-20T13:28:00.004+08:00</published><updated>2009-07-20T13:38:28.831+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='active learning'/><category scheme='http://www.blogger.com/atom/ns#' term='ranking'/><title type='text'>Support Vector Machine Learning for Image Retrieval</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://research.microsoft.com/en-us/um/people/leizhang/paper/icip01.pdf"&gt;Lei Zhang, Fuzong Lin and Bo Zhang&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is actually a vision paper. I am not sure whether the active learning is really just a version. In a query of similar images, the user labels some relevant and irrelevant  images. A SVM is then used to rank the images correspondingly. This will help the query though but will not help the later search. A possible improvement is adding some manifold regularization. Quite contrary to my expectation. 
Hmm...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8681239413292253877?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8681239413292253877/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8681239413292253877' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8681239413292253877'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8681239413292253877'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/support-vector-machine-learning-for.html' title='Support Vector Machine Learning for Image Retrieval'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-323418071598346733</id><published>2009-07-19T10:49:00.002+08:00</published><updated>2009-07-20T13:28:34.234+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><category scheme='http://www.blogger.com/atom/ns#' term='independence test'/><title type='text'>Regression by Dependence Minimization and its Application to Causal Inference in Additive Noise Models</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/279.pdf"&gt;Joris Mooij, Dominik Janzing, Jonas Peters and Bernhard 
Scholkopf&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper follows the &lt;a href="http://paperscanner.blogspot.com/2009/07/nonlinear-causal-discovery-with.html"&gt;NIPS 08 paper&lt;/a&gt;, with a different regressor. We know HSIC could be used for dependence maximization as well as independence maximization. Here we simply use its dependence. Basically there are two ways of doing regression, one to minimize the dependence of the residual and the input, the other to maximize the dependence of the response and the prediction (is that possible?)&lt;br /&gt;&lt;br /&gt;With HSIC it is easier to minimize the dependence of the response and the residue. The pro is that we do not need to specify the additive noise's distribution. In many regression problems, we actually assume the noise is a Gaussian. Now we may forget about the possible violation of this assumption. But the con is that now the optimization of the model will be much more difficult.&lt;br /&gt;&lt;br /&gt;The good thing about HSIC is that we have a statistical test, which allows us to judge whether it is statistically reliable to assert the independence. 
I am not sure whether we have a similar statistic for KDR-like terms.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-323418071598346733?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/323418071598346733/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=323418071598346733' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/323418071598346733'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/323418071598346733'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/regression-by-dependence-minimization.html' title='Regression by Dependence Minimization and its Application to Causal Inference in Additive Noise Models'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-798972050810761510</id><published>2009-07-19T10:35:00.003+08:00</published><updated>2009-07-19T10:49:23.814+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='causal inference'/><category scheme='http://www.blogger.com/atom/ns#' term='Gaussian process'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><category scheme='http://www.blogger.com/atom/ns#' term='independence test'/><title type='text'>Nonlinear Causal Discovery with Additive Noise Models</title><content 
type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.kyb.mpg.de/publications/attachments/NIPS2008-Hoyer-neu_5406%5B0%5D.pdf"&gt;Patrik O. Hoyer, Dominik Janzing, Joris Mooij, Jonas Peters and Bernhard Scholkopf&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about making causal inference from data. We know if two r.v.s have an additive linear relationship (with noise), the reverse relationship also holds. If there is a non-linear relationship, our causal inference will be easier since the inverse might not hold. This is the main result of this paper, finding if there exists a reverse relationship, the probability must satisfy a differential equation. From this equation we know, if &lt;code lang="eq.latex"&gt;\nu''' = \xi''' = 0&lt;/code&gt;, where the two functions are the logarithms of the PDFs of &lt;code lang="eq.latex"&gt;x&lt;/code&gt; and the additive noise &lt;code lang="eq.latex"&gt;n&lt;/code&gt;, &lt;code lang="eq.latex"&gt;f&lt;/code&gt; must be linear.&lt;br /&gt;&lt;br /&gt;This also suggests a way of making causal inference on DAGs. This leaves us to find a powerful regressor to capture the possible relationship between two r.v.s. The authors choose GPR. 
Then the residue should be statistically independent of &lt;code lang="eq.latex"&gt;x&lt;/code&gt;, which can be tested with HSIC.&lt;br /&gt;&lt;br /&gt;However, if we are to find the latent structure instead of doing a statistical test, this search will be intolerable if the DAG size is beyond 7.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-798972050810761510?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/798972050810761510/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=798972050810761510' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/798972050810761510'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/798972050810761510'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/nonlinear-causal-discovery-with.html' title='Nonlinear Causal Discovery with Additive Noise Models'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-9058770648756693413</id><published>2009-07-17T08:45:00.003+08:00</published><updated>2009-07-17T08:52:01.767+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='product of experts'/><category scheme='http://www.blogger.com/atom/ns#' term='boosting'/><title type='text'>Boosting Products of Base Classifiers</title><content 
type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/231.pdf"&gt;Balazs Kegl and Robert Busa-Fekete&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;Boosting is quite a useful technique for practical tasks, since we may get robust and high-precision classifier or regressor by combining the weak ones. Among them AdaBoost.MH is the most popular version (a little different from the one for binary version). This paper trains products of base classifiers instead of tree or MoE-like ones (considering Hinton's claim of the advantages of PoE over MoE). The design of the algorithm is not difficult though. The results on MNIST shows it is the second best algorithm (the best is DBN).&lt;br /&gt;&lt;br /&gt;Well, implement one :-p&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-9058770648756693413?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/9058770648756693413/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=9058770648756693413' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/9058770648756693413'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/9058770648756693413'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/boosting-products-of-base-classifiers.html' title='Boosting Products of Base Classifiers'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-518865051779500935</id><published>2009-07-16T09:55:00.004+08:00</published><updated>2009-07-20T14:43:02.788+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><title type='text'>Efficient Projections onto the l1-Ball for Learning in High Dimensions</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://icml2008.cs.helsinki.fi/papers/361.pdf"&gt;John Duchi, Shai Shalev-Shwartz, Yoram Singer and Tushar Chandra&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;The projection onto the L1 ball is another way of computing L1 norm penalty for certain regularization problems. That is to say, instead of&lt;pre lang="eq.latex"&gt;\min_w \Omega(w) + \lambda \| w \|_1 &lt;/pre&gt;we may optimize&lt;pre lang="eq.latex"&gt;\min_{w} \Omega(w) \quad \text{s.t. } \| w \|_1 \leq z&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;z&lt;/code&gt; is a constant. The former one can be seen as a series of optimization problems indexed by the regularization parameter &lt;code lang="eq.latex"&gt;\lambda&lt;/code&gt; and the latter is indexed by the constant &lt;code lang="eq.latex"&gt;z&lt;/code&gt;. We can show that the two forms are equivalent.&lt;br /&gt;&lt;br /&gt;Therefore, according to the latter, the optimization becomes finding a minimizer of the loss function on the L1 ball. If the minimizer happens to be inside the ball, it is the solution; otherwise, it must be on the boundary of the ball. So the difficult part becomes how we can project the vectors onto the L1 ball efficiently. This explains why the solution should be sparse as well.&lt;br /&gt;&lt;br /&gt;The first simple case is&lt;pre lang="eq.latex"&gt;\min_w \frac{1}{2} \| w - v \|_2^2 \quad \text{s.t. 
} \sum_{i = 1}^n w_i = z, w_i \geq 0,&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;v&lt;/code&gt; is inside the positive cone. With a little analysis, we know this is actually very simple. When &lt;code lang="eq.latex"&gt;\| v\|_1 \leq z&lt;/code&gt;, the solution is obvious. Otherwise, we should decrease the coordinates. Using a Lagrange multiplier, it is seen that every coordinate is shrunk by the same value. Therefore the norm &lt;code lang="eq.latex"&gt;\| w \|_1&lt;/code&gt; will decrease like &lt;code lang="eq.latex"&gt;n \theta, (n-1)\theta, \ldots&lt;/code&gt;; the slope changes whenever a coordinate vanishes. Therefore we stop when the piecewise linear function meets &lt;code lang="eq.latex"&gt;\| w \|_1 = z&lt;/code&gt;. We just need to find the intersection. If we implement it with a brute search, it ends up as an &lt;code lang="eq.latex"&gt;O(n \log n)&lt;/code&gt; algorithm, but with a little care (using the quick sort idea), the magic turns it into an &lt;code lang="eq.latex"&gt;O(n)&lt;/code&gt; algorithm.&lt;br /&gt;&lt;br /&gt;Then we come to other cases when &lt;code lang="eq.latex"&gt;v&lt;/code&gt; can be in other orthants, due to the symmetry. Finally, we will use a tree structure to boost up the speed. In our implementation of a gradient-based algorithm, such as&lt;pre lang="eq.latex"&gt;w^{(t + 1)} = \Pi_W \Big( w^{(t)} + g^{(t)} \Big),&lt;/pre&gt;we calculate the projection onto the L1 ball, i.e. &lt;code lang="eq.latex"&gt;\Pi_W(\cdot)&lt;/code&gt;. The idea is to use a red-black tree (gonna review these structures). 
The the projection can be done in the time which scales linearly in the non-zero entries of &lt;code lang="eq.latex"&gt;g^{(t)}&lt;/code&gt; and logarithmically in the total number of features &lt;code lang="eq.latex"&gt;n&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;Let's compare this one with the &lt;a href="http://paperscanner.blogspot.com/2009/07/gradient-descent-with-sparsification.html"&gt;ICML 09 paper&lt;/a&gt; and several other papers later.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-518865051779500935?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/518865051779500935/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=518865051779500935' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/518865051779500935'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/518865051779500935'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/efficient-projections-onto-l1-ball-for.html' title='Efficient Projections onto the l1-Ball for Learning in High Dimensions'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5577416287937161713</id><published>2009-07-16T08:29:00.002+08:00</published><updated>2009-07-16T08:56:26.371+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' 
term='discriminative model'/><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><title type='text'>Discriminative k-Metrics</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/179.pdf"&gt;Arthur Szlam and Guillermo Sapiro&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper tells us about an old clustering technique called k q-flats. It can be regarded as a generalization of k-means algorithm but we find a q-dimensional space to project onto instead of a centroid for each cluster,&lt;pre lang="eq.latex"&gt;\sum_{j = 1}^K \sum_{x \in K_j} \| x - P_{F_j} x \|^2&lt;/pre&gt;This can be solved with k-means-like (EM-like) algorithms. Basically if we want to apply it to classification problems, we may train a k q-flats for each class but this model lacks discriminative power.&lt;br /&gt;&lt;br /&gt;But this can be solved in a natural way&lt;pre lang="eq.latex"&gt;\sum_{i = 1}^c\sum_{j = 1}^K \left( \sum_{x \in K_{i, j}}g_1(\| F_{i, j}^\top x \|^2) + \sum_{x \not \in C_i} g_2(\| F_{i, j}^\top x\|^2)\right)&lt;/pre&gt;A little difference is the optimization has to be done with gradient. 
The selected &lt;code lang="eq.latex"&gt;g_i&lt;/code&gt; are Hinge-like loss funtions and the derivatives are simple piecewise constant functions.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5577416287937161713?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5577416287937161713/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5577416287937161713' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5577416287937161713'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5577416287937161713'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/discriminative-k-metrics.html' title='Discriminative k-Metrics'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7821138687855367613</id><published>2009-07-15T12:18:00.000+08:00</published><updated>2009-07-15T17:42:29.456+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><category scheme='http://www.blogger.com/atom/ns#' term='sparsity'/><category scheme='http://www.blogger.com/atom/ns#' term='bayesian framework'/><title type='text'>On Primal and Dual Sparsity of Markov Networks</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" 
href="http://www.cs.mcgill.ca/%7Eicml2009/papers/274.pdf"&gt;Jun Zhu and Eric P. Xing&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper mainly talks about the relationship of several M3Ns. The primal and dual sparsity are caused by L1 norm penalty and the constraints (according to KKT conditions). Therefore adding L1 norm penalty to M3N will cause both sparsities, which increases generalization capacity and selects important features.&lt;br /&gt;&lt;br /&gt;The LapM3N proposed by the authors earlier is a Bayesian version of M3N. The MAP estimator of LapM3N would go as M3N with different penalties, e.g. L2 norm corresponding to a Gaussian prior and L1 norm corresponding to a Laplace prior with parameter going to infinity.&lt;br /&gt;&lt;br /&gt;Another connection is found between the L1 norm and sparse Bayesian learning, since adding a prior for each parameter of M3N (think about RVM) would result in a sparse solution. The adaptive M3N will yield the same result as the L1-normed M3N.&lt;br /&gt;&lt;br /&gt;The authors proposed an EM-like algorithm for training L1-normed M3N. 
Obviously, it would have connection to variational Bayesian approximation.&lt;br /&gt;&lt;br /&gt;Maybe we should write our own structured learning tools.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7821138687855367613?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7821138687855367613/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7821138687855367613' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7821138687855367613'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7821138687855367613'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/on-primal-and-dual-sparsity-of-markov.html' title='On Primal and Dual Sparsity of Markov Networks'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-285015489538593272</id><published>2009-07-13T13:47:00.004+08:00</published><updated>2009-07-15T12:16:39.314+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sparsity'/><category scheme='http://www.blogger.com/atom/ns#' term='GPU'/><category scheme='http://www.blogger.com/atom/ns#' term='deep belief network'/><title type='text'>Large-scale Deep Unsupervised Learning using Graphics Processors</title><content type='html'>&lt;div style="text-align: right;"&gt;by 
&lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/218.pdf"&gt;Rajat Raina, Anand Madhavan and Andrew Ng&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is a paper on implementation of DBN and sparse coding algorithms with GPU. The finer-grain parallelism provided by the modern GPU outperforms CPU architecture. The bottleneck is the IO, from main memory to the memory inside the video adapter. The finer-grain parallelism allows us to deal with data parallelism and the data are divided for each block and the job assigned to each block is further divided into threads' labor.&lt;br /&gt;&lt;br /&gt;They said they would provide their code online. I have not found it.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-285015489538593272?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/285015489538593272/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=285015489538593272' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/285015489538593272'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/285015489538593272'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/large-scale-deep-unsupervised-learning.html' title='Large-scale Deep Unsupervised Learning using Graphics Processors'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5564645964913233256</id><published>2009-07-11T10:23:00.006+08:00</published><updated>2009-07-11T15:28:22.357+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='roughly scanned'/><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='sparsity'/><category scheme='http://www.blogger.com/atom/ns#' term='compressive sampling'/><title type='text'>Gradient Descent with Sparsification: An Iterative Algorithm for Sparse Recovery with Restricted Isometry Property</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/115.pdf"&gt;Rahul Garg and Rohit Khandekar&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper tells one story about the following optimization problem arising from compressive sampling&lt;pre lang="eq.latex"&gt;\min_x \quad \| x \|_0 \qquad \text{s.t.} \quad \Phi x = y,&lt;/pre&gt;which is usually approximated with the following version&lt;pre lang="eq.latex"&gt;\min_x \quad \| x \|_1 \qquad \text{s.t.} \quad \Phi x = y.&lt;/pre&gt;We have quite a few algorithms for solving the optimization under different conditions. 
This paper proposes the following gradient-based algorithm&lt;pre lang="eq.latex"&gt;x \leftarrow H_s \left( x + \frac{1}{\gamma} \cdot \Phi^\top (y - \Phi x)\right)&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;H_s&lt;/code&gt; take the largest (absolute value) &lt;code lang="eq.latex"&gt;s&lt;/code&gt; elements of &lt;code lang="eq.latex"&gt;x&lt;/code&gt; and other as 0.&lt;br /&gt;&lt;br /&gt;The theoretical result is their small computation cost (&lt;code lang="eq.latex"&gt;K&lt;/code&gt; in each iteration), fast convergence rate (&lt;code lang="eq.latex"&gt;2L / \log(\frac{1 - \delta_{2s}}{2\delta_{2s}})&lt;/code&gt;) and lenient recovery condition (&lt;code lang="eq.latex"&gt;\delta_{2s} &amp;lt; \frac{1}{3}&lt;/code&gt;). I think it might be a good algorithm if we late want to make some lasso experiments.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5564645964913233256?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5564645964913233256/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5564645964913233256' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5564645964913233256'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5564645964913233256'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/gradient-descent-with-sparsification.html' title='Gradient Descent with Sparsification: An Iterative Algorithm for Sparse Recovery with Restricted Isometry 
Property'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3745146751922755781</id><published>2009-07-10T13:38:00.006+08:00</published><updated>2009-07-16T09:55:02.167+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='discriminative model'/><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><title type='text'>Sparse Higher Order Conditional Random Fields for Improved Sequence Labeling</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/62.pdf"&gt;Xian Qian, Xiaoqian Jiang, Qi Zhang, Xuanjing Huang and Lide Wu&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper adds higher-order features to the CRF model with exact inference. The feasibility is caused by the sparsity. That's to say although we include higher order features in the feature vector, it seldom fires. So in the inference stage (since we have to calculate the gradient), the computational complexity will not go exponentially and the sparsity makes the exact inference possible. This paper extends the terms for linear-chain CRF into configurations which can be used for higher-order features. That's the main contribution.&lt;br /&gt;&lt;br /&gt;I will write something related to this problem soon. 
First I have to finish some optimization code.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3745146751922755781?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3745146751922755781/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3745146751922755781' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3745146751922755781'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3745146751922755781'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/sparse-higher-order-conditional-random.html' title='Sparse Higher Order Conditional Random Fields for Improved Sequence Labeling'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-90499863365972781</id><published>2009-07-10T13:15:00.003+08:00</published><updated>2009-07-10T13:38:43.405+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='novel idea'/><title type='text'>Curriculum Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/119.pdf"&gt;Yoshua Bengio, Jerome Louradour, Ronan Collobert and Jason Weston&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;The so-called curriculum learning states the idea of 
learning things from simple ones to difficult ones. The gradually hardening tasks help build a classifier with better generalization capacity.&lt;br /&gt;&lt;br /&gt;There is evidence from cognitive sciences as well as from machine learning itself. In optimization theory, the famous continuation method actually has the same spirit. Another example is the deep belief nets, in which the greedy pretraining can be seen as a simpler task than the subsequent fine tuning. The examples provided by the authors come from simple toy experiments in low-dimensional space (train two Bayesian classifiers with or without difficult examples), a shape learning task with neural nets (with or without a switching epoch, at which the training set is switched from simple to difficult samples), and an NLP example.&lt;br /&gt;&lt;br /&gt;Their claim includes several messages:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;difficult examples may not be useful;&lt;/li&gt;&lt;li&gt;better curriculum might speed up online learning and guide the result to the region where better generalization can be found;&lt;/li&gt;&lt;li&gt;the idea might be connected with active learning and boosting.&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-90499863365972781?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/90499863365972781/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=90499863365972781' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/90499863365972781'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/90499863365972781'/><link rel='alternate' 
type='text/html' href='http://paperscanner.blogspot.com/2009/07/curriculum-learning.html' title='Curriculum Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6475768240426484728</id><published>2009-07-09T10:52:00.001+08:00</published><updated>2009-07-11T10:22:18.191+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='graph embedding'/><category scheme='http://www.blogger.com/atom/ns#' term='semi-supervised learning'/><title type='text'>Graph Construction and b-Matching for Semi-supervised Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/188.pdf"&gt;Tony Jebara, Jun Wang and Shi-fuh Chang&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is an ``advertisement'' paper for their 2007 paper on b-matching. In general b-matching is a graph construction algorithm as k-NN. This paper enumerate all kinds of combinations of label diffusion (propagation?):&lt;br /&gt;&lt;ul&gt;&lt;li&gt;graph construction, kNN and b-matching&lt;/li&gt;&lt;li&gt;weight, Gaussian kernel, LLR (like LLE).&lt;/li&gt;&lt;li&gt;diffusion algorithms, GRF, LGC and GTAM.&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;I don't buy their conclusion. Yes graph construction might be important to later algorithms, but the b-matching result doesn't seem so different from k-NN. 
Maybe we have to try for ourselves.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6475768240426484728?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6475768240426484728/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6475768240426484728' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6475768240426484728'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6475768240426484728'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/graph-construction-and-b-matching-for.html' title='Graph Construction and b-Matching for Semi-supervised Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3676551049993763171</id><published>2009-07-09T09:28:00.002+08:00</published><updated>2009-07-09T10:51:51.732+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='graph embedding'/><category scheme='http://www.blogger.com/atom/ns#' term='semidefinite programming'/><category scheme='http://www.blogger.com/atom/ns#' term='manifold learning'/><title type='text'>Minimum Volume Embedding</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.columbia.edu/%7Ejebara/papers/aistatsMVE07.pdf"&gt;Blake Shaw 
and Tony Jebara&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is another piece of work by the authors of the previously scanned paper. This work is mainly based on the MVU paper, where the graph is embedded with isometry constraints (linear for the Gram matrix) and maximized variance (the trace of the Gram matrix). Therefore the Gram matrix can be obtained via SDP optimization techniques.&lt;br /&gt;&lt;br /&gt;But the variance to be maximized is harmful since it might cause the variance in all directions to increase, which is not necessary (as is illustrated in the example of the paper). The authors take the difference of the eigenvalues of the Gram matrix to be optimized&lt;pre lang="eq.latex"&gt;\max \sum_{i= 1}^d \lambda_i - \sum_{i = d+1}^N \lambda_i&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;\lambda_i&lt;/code&gt; are the eigenvalues of &lt;code lang="eq.latex"&gt;K&lt;/code&gt; and &lt;code lang="eq.latex"&gt;K&lt;/code&gt; must satisfy the same constraints as MVU.&lt;br /&gt;&lt;br /&gt;To solve the problem, the authors proposed an iterative algorithm based on SDP. In each iteration, the eigenvectors are renewed by PCA of the Gram matrix and the Gram matrix is updated with SDP. It can be proved the algorithm will converge to a local optimum. 
This will force those eigen values irrelevant to the embedding to zero and therefore cause a minimum volume embedding.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3676551049993763171?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3676551049993763171/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3676551049993763171' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3676551049993763171'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3676551049993763171'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/minimum-volume-embedding.html' title='Minimum Volume Embedding'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7842757737555253654</id><published>2009-07-09T08:22:00.002+08:00</published><updated>2009-07-09T09:20:08.575+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='graph embedding'/><category scheme='http://www.blogger.com/atom/ns#' term='semidefinite programming'/><category scheme='http://www.blogger.com/atom/ns#' term='manifold learning'/><title type='text'>Structure Preserving Embedding</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.mcgill.ca/%7Eicml2009/papers/418.pdf"&gt;&lt;span 
style="font-style: italic;"&gt;Blake Shaw and Tony Jebara&lt;/span&gt;&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper proposes the concept of structure preserving embedding: given an algorithm to construct a graph &lt;code lang="eq.latex"&gt;\mathcal{G}&lt;/code&gt;, it would yield the same graph as the given affinity matrix &lt;code lang="eq.latex"&gt;A_0&lt;/code&gt; with the computed Gram matrix &lt;code lang="eq.latex"&gt;K&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;There are two kinds of algorithms to construct the graph:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;kNN and &lt;code lang="eq.latex"&gt;\epsilon&lt;/code&gt;-ball; we can find linear constraints for &lt;code lang="eq.latex"&gt;K&lt;/code&gt; which ensure the nearby samples are nearer than other samples (the constraints are at most &lt;code lang="eq.latex"&gt;O(N^2)&lt;/code&gt;).&lt;/li&gt;&lt;li&gt;Maximum weight subgraph, &lt;code lang="eq.latex"&gt;b&lt;/code&gt;-matching subgraph, maximum weight spanning tree. The number of constraints might be exponential in &lt;code lang="eq.latex"&gt;N&lt;/code&gt;.&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;The objective is &lt;code lang="eq.latex"&gt;\mathrm{tr}(K A)&lt;/code&gt;, subject to &lt;code lang="eq.latex"&gt;\mathrm{tr}(K) \leq 1&lt;/code&gt; and &lt;code lang="eq.latex"&gt;K \succeq 0&lt;/code&gt;. The authors prove that under these conditions, the Gram matrix has rank 1. The embedding is calculated with constraints with a common slack variable &lt;code lang="eq.latex"&gt;\xi&lt;/code&gt;, which allows possible violation of the constraints. 
The resulting embedding might then have more dimensions.&lt;br /&gt;&lt;br /&gt;For the first category of constraints, SDP can be directly applied while for the second category, we first use SDP without constraints and then add the most violated constraints one-by-one, each time the model is updated with SDP until convergence is observed.&lt;br /&gt;&lt;br /&gt;This technique is best regarded as a visualization technique (for graphs) though it could also be used in dimensionality reduction tasks for classification. The experiments show that with SP constraints added to MVU and MVE, the classification rate improves.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7842757737555253654?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7842757737555253654/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7842757737555253654' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7842757737555253654'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7842757737555253654'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/structure-preserving-embedding.html' title='Structure Preserving Embedding'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6911359041838164116</id><published>2009-07-08T15:38:00.004+08:00</published><updated>2009-07-08T17:01:55.786+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='roughly scanned'/><category scheme='http://www.blogger.com/atom/ns#' term='information theory'/><category scheme='http://www.blogger.com/atom/ns#' term='Bregman divergence'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><category scheme='http://www.blogger.com/atom/ns#' term='metric learning'/><title type='text'>Geometric-aware Metric Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/193.pdf"&gt;Zhengdong Lu, Prateek Jain and Inderjit S. Dhillon&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is an extension to the &lt;a href="http://paperscanner.blogspot.com/2009/07/information-theoretic-metric-learning.html"&gt;ICML 07 paper&lt;/a&gt;, with introduction of graph regularization. The idea is to find a pair of kernel matrices, one from task-dependent kernel set and the other from a parametrized data-dependent kernels. The two sets are quite different. The former is used in later classification or related tasks and they choose the Mahalanobis kernel. The later contains locality information and is created with the graph kernels (a subspace spanned by the eigen vectors corresponding to the smallest eigenvalues of the Laplacian matrix). To measure the similarity of the two distance, the Bregman divergence is employed. the difference between the current optimization and the one in the ICML 07 paper is that the other matrix is not fixed. The rough idea is to update them alternatively (just as in LSQ in tensor).&lt;br /&gt;&lt;br /&gt;We may change the data-dependent kernel for different tasks (e.g. 
unsupervised tasks using graph kernels, supervised tasks using labels). There are connections with regularization theory: the solution has a representation of a combination of graph kernel and another term which can be interpreted as a regularizer. Another possible connection is to GP: GP could be regarded as a special case of the proposed framework. I think this topic is by then very interesting. I will come back later to this topic when I have time.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6911359041838164116?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6911359041838164116/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6911359041838164116' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6911359041838164116'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6911359041838164116'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/geometric-aware-metric-learning.html' title='Geometric-aware Metric Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-2646052036190296339</id><published>2009-07-08T15:09:00.004+08:00</published><updated>2009-07-08T15:33:52.058+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical 
model'/><category scheme='http://www.blogger.com/atom/ns#' term='graph'/><category scheme='http://www.blogger.com/atom/ns#' term='expectation maximization'/><category scheme='http://www.blogger.com/atom/ns#' term='manifold learning'/><title type='text'>Probabilistic Dyadic Data Analysis with Local and Global Consistency</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/34.pdf"&gt;Deng Cai, Xuanhui Wang and Xiaofei He&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper introduces a graph regularizer for PLSA. There are some interesting stories behind the NLP algorithms such as PLSA, LDA (latent Dirichlet allocation), I think, but I am not quite familiar with them. From the modeling aspect, they deal with the same thing. Given a term-document matrix, &lt;code lang="eq.latex"&gt;X&lt;/code&gt;, the element of which is the count of occurrences of a given word &lt;code lang="eq.latex"&gt;w_j&lt;/code&gt; in the document &lt;code lang="eq.latex"&gt;d_i&lt;/code&gt;. The generative model is that each document is a mixture of multinomial distributions. Namely, we have several latent topics &lt;code lang="eq.latex"&gt;z_k&lt;/code&gt; such that a document is a mixture of the topics &lt;code lang="eq.latex"&gt;z_k&lt;/code&gt; with the mixing proportion &lt;code lang="eq.latex"&gt;\Pr(z_k \mid d_i)&lt;/code&gt;. Each topic determines a distribution over the words, namely &lt;code lang="eq.latex"&gt;\Pr(w_j \mid z_k)&lt;/code&gt;. The PLSA algorithm simply uses EM to train the mixture model.&lt;br /&gt;&lt;br /&gt;The LDA is not as far as I think. It's just a Bayesian version of PLSA. We know the natural extension of mixture models in frequentists' domain is adding a Dirichlet prior for the proportion of the latent variable. And the more sophisticated technique is non-parametric Bayesian, adding a Dirichlet process as the prior. 
The fully Bayesian method need the posterior distribution which is approximated with (global) variation inference.&lt;br /&gt;&lt;br /&gt;The two algorithms do not take into consideration the local information. This paper simply adds a regularizer to the PLSA loss (-log likelihood). Here the authors assume if the documents are similar, their mixing proportions must be similar. So if &lt;code lang="eq.latex"&gt;d_{i_1}&lt;/code&gt; and &lt;code lang="eq.latex"&gt;d_{i_2}&lt;/code&gt; are similar, according to some rules (labels in classification or cosine of the angle between two tf-idf vectors in unsupervised case), the two distribution &lt;code lang="eq.latex"&gt;\Pr(z_k \mid d_{i_1})&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\Pr( z_k \mid d_{i_2})&lt;/code&gt; are similar. This similarity is measured with the symmetric KL divergence:&lt;pre lang="eq.latex"&gt;\min -\sum_{i = 1}^N \sum_{j = 1}^M n(d_i, w_j) \log\sum_{k = 1}^K \Pr(w_j \mid z_k) \Pr(z_k \mid d_i) + \frac{\lambda}{2} \sum_{i_1, i_2 = 1}^N W_{i_1, i_2} \big( D(\Pr(z_k \mid d_{i_1})\parallel \Pr(z_k \mid d_{i_2})) + D(\Pr(z_k \mid d_{i_2})\parallel \Pr(z_k \mid d_{i_1})) \big)&lt;/pre&gt;The optimization can still be solved with EM with a little approximation. 
The difficult part is the maximization step.&lt;br /&gt;&lt;br /&gt;The result is really good: in unsupervised case, it is better than NMF and PLSA, LDA and even N-cut.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-2646052036190296339?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/2646052036190296339/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=2646052036190296339' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2646052036190296339'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/2646052036190296339'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/probabilistic-dyadic-data-analysis-with.html' title='Probabilistic Dyadic Data Analysis with Local and Global Consistency'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4667801992579995481</id><published>2009-07-08T15:04:00.003+08:00</published><updated>2009-07-08T15:38:37.673+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='roughly scanned'/><category scheme='http://www.blogger.com/atom/ns#' term='Bregman divergence'/><category scheme='http://www.blogger.com/atom/ns#' term='metric learning'/><title type='text'>Information Theoretic Metric Learning</title><content type='html'>&lt;div 
style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.machinelearning.org/proceedings/icml2007/papers/404.pdf"&gt;Jason V. Davis, Brian Kulis, Prateek Jain, Suvrit Sra and Inderjit S. Dhillon&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper talks about a metric learning algorithm. The basic tool is the so-called Bregman divergence. The setting is to find a Mahalanobis distance, therefore a matrix &lt;code lang="eq.latex"&gt;A&lt;/code&gt; for inner product (&lt;code lang="eq.latex"&gt;x^\top A x&lt;/code&gt;). The trick is we need our matrix &lt;code lang="eq.latex"&gt;A&lt;/code&gt; to be as close as possible to a given &lt;code lang="eq.latex"&gt;A_0&lt;/code&gt;, which is achieved via the KL divergence of two Gaussians with &lt;code lang="eq.latex"&gt;A_0&lt;/code&gt; and &lt;code lang="eq.latex"&gt;A&lt;/code&gt; as their covariance matrix respectively. We further have two sets, one including pairs of samples that are near (their distances are less than a threshold), the other including dis-similar ones (their distances are larger than another threshold). It can be derived the KL divergence is actually a form of Bregman divergence and equals &lt;code lang="eq.latex"&gt;\mathrm{LogDet}(A_0, A) = \mathrm{tr}(A A_0^{-1}) - \log \det (A A_0^{-1}) - n&lt;/code&gt;. The constraints are linear to the matrix &lt;code lang="eq.latex"&gt;A&lt;/code&gt; to be optimized. This distance is the only scale-invariant and the loss leads to uniform minimum variance unbiased estimator. As in SVM, we may introduce slack variables to tackle the infeasible cases. To solve this optimization, a former algorithm (ICML 2006 paper) is extended.&lt;br /&gt;&lt;br /&gt;There are some connections to the previous work too, since it can be proved the low-rank kernel matrix is simply induced by the Mahalanobis kernel we find in the proposed metric learning algorithm. And the paper also addresses the online metric learning problem. 
They prove the proposed algorithm will converge.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4667801992579995481?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4667801992579995481/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4667801992579995481' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4667801992579995481'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4667801992579995481'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/information-theoretic-metric-learning.html' title='Information Theoretic Metric Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6884506219186779253</id><published>2009-07-02T22:44:00.005+08:00</published><updated>2009-07-04T00:38:43.590+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='graph embedding'/><category scheme='http://www.blogger.com/atom/ns#' term='feature selection'/><category scheme='http://www.blogger.com/atom/ns#' term='unsupervised learning'/><title type='text'>Robust Feature Extraction via Information Theoretic Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" 
href="http://www.cs.mcgill.ca/%7Eicml2009/papers/21.pdf"&gt;Xiao-Tong Yuan and Bao-Gang Hu&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper is based on the so-called Renyi's entropy, which is defined as&lt;pre lang="eq.latex"&gt;H_2(p) = -\log \int p^2(x) \,\mathrm{d} x&lt;/pre&gt;Given a random variable &lt;code lang="eq.latex"&gt;x&lt;/code&gt;'s sample, the empirical Renyi's entropy is estimated with kernel density estimator&lt;pre lang="eq.latex"&gt;\hat{H}_2(X) = - \log \hat{V}(X) + \text{const}, \qquad \hat{V}(X) = \sum_i \sum_j g(x_i - x_j, \sigma)&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;g(x-z, \sigma) = \exp( -\| x - z \|^2 / (2\sigma^2))&lt;/code&gt;. &lt;code lang="eq.latex"&gt;\hat{V}(X)&lt;/code&gt; is called information potential. For two r.v.s, we have similar result,&lt;pre lang="eq.latex"&gt;\hat{H}_2(X_1, X_2) = -\log \hat{V}(X_1, X_2) + \text{const}, \qquad \hat{V}(X_1, X_2) = \sum_i \sum_j g(x_i^{(1)} - x_j^{(2)}, \sigma)&lt;/pre&gt;where the second term measures the correlation of two r.v.s.&lt;br /&gt;&lt;br /&gt;The proposed objective is to find a projection &lt;code lang="eq.latex"&gt;Y = WX&lt;/code&gt; such that&lt;pre lang="eq.latex"&gt;\max_{W} (1 - \lambda) \hat{V}(WX) + \lambda \hat{V}(WX, C) - \gamma \| W \|_\text{fro}^2&lt;/pre&gt;They find that when &lt;code lang="eq.latex"&gt;\lambda = 1&lt;/code&gt; we have a so-called robust M-estimator. The optimization algorithm they find is very similar to majorization minimization (auxiliary function). The interesting relationship they listed in the paper includes that with LPP. The iterative algorithm solves a LPP/LapRLS/SRDA in each iteration.&lt;br /&gt;&lt;br /&gt;I am afraid this looks quite like the unsupervised HSIC for SDR problem. 
I will check it soon.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6884506219186779253?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6884506219186779253/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6884506219186779253' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6884506219186779253'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6884506219186779253'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/robust-feature-extraction-via.html' title='Robust Feature Extraction via Information Theoretic Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-770611604136672438</id><published>2009-07-02T20:29:00.004+08:00</published><updated>2009-07-02T22:42:16.672+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='bayesian framework'/><title type='text'>A Majorization-Minimization Algorithm for (Multiple) Hyperparameters Learning</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a href="http://www.cs.mcgill.ca/%7Eicml2009/papers/20.pdf"&gt;Chuan-Sheng Foo, Chuong B. 
Do and Andrew Ng&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;First we must understand what the majorization-minimization (MM) algorithm is. It is in fact the auxiliary function method, which might be regarded as a generalization of the EM algorithm. To minimize (maximize) a function &lt;code lang="eq.latex"&gt;L(x)&lt;/code&gt;, we find an upper (lower) bound for the objective which is easier to minimize (maximize), &lt;code lang="eq.latex"&gt;Q(x; x')&lt;/code&gt; where the &lt;code lang="eq.latex"&gt;x'&lt;/code&gt; is the parameter for the upper (lower) bounding function. This can be seen as a generalization of the idea of CCCP, since in CCCP, we use linear functions to bound the convex (concave) functions and the parameter is simply the variable introduced by Legendre-Fenchel transform. And we require that &lt;code lang="eq.latex"&gt;L(x) - Q(x; x')&lt;/code&gt; reaches its maximum (minimum) at &lt;code lang="eq.latex"&gt;x = x'&lt;/code&gt;. As we can see here, the following inequality holds&lt;pre lang="eq.latex"&gt;L(x) \leq Q(x; x') \qquad \text{or} \qquad L(x) \geq Q(x; x')&lt;/pre&gt;The idea is instead of optimizing &lt;code lang="eq.latex"&gt;L(x)&lt;/code&gt; directly due to its intractability, we may find a suitable algorithm to optimize &lt;code lang="eq.latex"&gt;Q(x; x')&lt;/code&gt; where &lt;code lang="eq.latex"&gt;x' = x^{(n)}&lt;/code&gt;. 
Therefore for the minimization case we have the following inequality&lt;pre lang="eq.latex"&gt;L(x^{(n)}) = Q(x^{(n)}; x^{(n-1)}) + L(x^{(n)}) - Q(x^{(n)}; x^{(n-1)}) \leq Q(x^{(n-1)}; x^{(n-1)}) + L(x^{(n-1)}) - Q(x^{(n-1)}; x^{(n-1)}) = L(x^{(n-1)})&lt;/pre&gt;where the inequality uses two optima, &lt;code lang="eq.latex"&gt;Q(x^{(n)}; x^{(n-1)})&lt;/code&gt; is the minimum of &lt;code lang="eq.latex"&gt;Q(x; x^{(n-1)})&lt;/code&gt; and &lt;code lang="eq.latex"&gt;L(x) - Q(x; x^{(n-1)})&lt;/code&gt; reaches its maximum at &lt;code lang="eq.latex"&gt;x = x^{(n-1)}&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;Now let's come back to the Bayes learning problem. We know to select a proper hyperparameter, either we use cross validation, or we rely on maximization of Type II likelihood (EM solves it :-p). Here the authors simply introduce another prior for the hyperparameter and integrate it out. E.g. for a Gaussian prior for the parameter, usually the hyperparameter is the precision, whose prior is then set to a Gamma distribution. But they kind of stealthily switch the concept. In Bayes framework, the joint distribution is then &lt;code lang="eq.latex"&gt;\Pr(\mathcal{D} \mid w) \Pr(w \mid \alpha) \Pr(\alpha)&lt;/code&gt;. They first integrate out the hyperparameter &lt;code lang="eq.latex"&gt;\alpha&lt;/code&gt;, which leaves us &lt;code lang="eq.latex"&gt;\Pr(\mathcal{D} \mid w) \Pr(w)&lt;/code&gt;. Then we find the MAP estimation for simplicity, which results in&lt;pre lang="eq.latex"&gt;\min_w -\log \Pr(\mathcal{D} \mid w) - \log \Pr(w).&lt;/pre&gt;Now we may find the second term is of the form of &lt;code lang="eq.latex"&gt;- C\cdot\log (\| w \|^2 + \beta )&lt;/code&gt;, which is different from the norm regularizer.&lt;br /&gt;&lt;br /&gt;We solve this problem with MM optimization technique. 
We construct an upper bound for the second term&lt;pre lang="eq.latex"&gt;\log x \leq \log y + \frac{x - y}{y}&lt;/pre&gt;which is a linear function of &lt;code lang="eq.latex"&gt;x&lt;/code&gt; and therefore the optimization problem turns back to the original ``loss + norm regularizer'' form.&lt;br /&gt;&lt;br /&gt;Don't see any thing special from this point of view.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-770611604136672438?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/770611604136672438/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=770611604136672438' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/770611604136672438'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/770611604136672438'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/majorization-minimization-algorithm-for.html' title='A Majorization-Minimization Algorithm for (Multiple) Hyperparameters Learning'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8270733302571970630</id><published>2009-07-02T15:21:00.004+08:00</published><updated>2009-07-02T17:15:23.906+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='roughly scanned'/><category 
scheme='http://www.blogger.com/atom/ns#' term='information theory'/><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><title type='text'>Information Theoretic Measures for Clustering Comparisons: Is a Correction for Chance Nessary?</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.cs.mcgill.ca/%7Eicml2009/papers/10.pdf"&gt;Nguyen Xuan Vinh, Julien Epps and James Bailey&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;One problem in clustering is how to measure two clustering result. E.g. given two clustering results, which one is more closed to the true clustering?&lt;br /&gt;&lt;br /&gt;There are many analytical methods, including Rand Index (RI) and Adjusted Rand Index (ARI). They are based on contingency table. Although RI could be useful since it is 1 iff the two clustering are identical and 0 when no pair are in the same cluster, it can not extinguish a random partition with a ``good clustering''.&lt;br /&gt;&lt;br /&gt;To correct this (why it is called correction for chance), ARI is proposed. Another class comes from information theory. They use the mutual information of proportion of two clustering (each clustering's proportion is a density, therefore the mutual information is the common information in the two clustering). As we know we can create a distance based on entropy &lt;code lang="eq.latex"&gt;D(p, q) = H(p, q) - I(p, q) = H(p) + H(q) - 2 I(p, q)&lt;/code&gt; or its normalized version &lt;code lang="eq.latex"&gt;\mathrm{NMI}(p, q) = \frac{I(p, q)}{\sqrt{H(p) H(q)}}&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;The authors proposed a corrected-for-chance version based on information method and from their experiments, we may find it works pretty well. 
Their criterion is&lt;pre lang="eq.latex"&gt;\mathrm{AMI}(U, V) = \frac{I(U, V) - \mathbb{E}( I(M) \mid a, b)}{\sqrt{H(U) H(V)} - \mathbb{E}(I(M) \mid a, b)}&lt;/pre&gt;or&lt;pre lang="eq.latex"&gt;\mathrm{AVI} = \frac{2 I(U, V) - 2 \mathbb{E}(I(M) \mid a, b)}{H(U) + H(V) - 2 \mathbb{E}(I(M) \mid a, b)}&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;\mathbb{E}(I(M) \mid a, b)&lt;/code&gt; is computed from the contingency table.&lt;br /&gt;&lt;br /&gt;The author provided examples where the correction is necessary. Esp. when samples in each cluster is few.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8270733302571970630?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8270733302571970630/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8270733302571970630' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8270733302571970630'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8270733302571970630'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/information-theoretic-measures-for.html' title='Information Theoretic Measures for Clustering Comparisons: Is a Correction for Chance Nessary?'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5121988705137925528</id><published>2009-07-02T10:11:00.003+08:00</published><updated>2009-07-02T15:19:39.449+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>Characteristic Kernels on Groups and Semigroups</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://books.nips.cc/papers/files/nips21/NIPS2008_0458.pdf"&gt;Kenji Fukumizu, Bharath Sriperumbudur, Arthur Gretton and Bernhard Scholkopf&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;First let's see some key ideas in kernel-based methods for r.v.s:&lt;br /&gt;To see whether two r.v.s have the same distribution, we test the mean of arbitrary function values for two r.v.s. This leads to distances between two means in the RKHS &lt;code lang="eq.latex"&gt;\mathcal{H}&lt;/code&gt;. Hence we introduce a map of the r.v. &lt;code lang="eq.latex"&gt;X&lt;/code&gt; into the linear functional of a RKHS &lt;code lang="eq.latex"&gt;\mathbb{E}_X K(\cdot, X)&lt;/code&gt;. Then for arbitrary function &lt;code lang="eq.latex"&gt;f \in \mathcal{H}&lt;/code&gt;, the mean value is &lt;code lang="eq.latex"&gt;\mathbb{E} \langle K(\cdot, X), f(\cdot) \rangle = \mathbb{E} f(X)&lt;/code&gt;. Given two r.v.s &lt;code lang="eq.latex"&gt;X, Y&lt;/code&gt;, we can see the discrepancies on any given function &lt;code lang="eq.latex"&gt;f \in \mathcal{H}&lt;/code&gt;, &lt;code lang="eq.latex"&gt;\mathrm{MMD}(X, Y; \mathcal{H}) = \max_{f \in \mathcal{H}} \mathbb{E} f(X) - \mathbb{E} f(Y)&lt;/code&gt;. In practice we calculate the empirical squared version.&lt;br /&gt;&lt;br /&gt;The critical problem of this method is whether the mapping from the r.v. to the linear functional is unique. 
Given that the conjugate space of the RKHS is simply itself, we can also interpret this as the uniqueness of the mapping from the r.v. to the feature space (RKHS) is unique. Please notice the RKHS is induced by the kernel and their relationship is one-to-one. Hence the thing that matters is what kind of kernel might cause the uniqueness of the mapping.&lt;br /&gt;&lt;br /&gt;This is explored in this paper, the so-called characteristic kernel, which has the property that the induced mapping is injective. With this kind of kernel, criterions such as MMD will be a well-defined distance between two r.v.s (or measures as in the paper). Here are some main results:&lt;br /&gt;Lemma 1. Let &lt;code lang="eq.latex"&gt;(\Omega, \mathcal{B})&lt;/code&gt; be a measurable space, &lt;code lang="eq.latex"&gt;k&lt;/code&gt; be a measurable, postive definite kernel on &lt;code lang="eq.latex"&gt;\Omega&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\mathcal{H}&lt;/code&gt; be the corresponding RKHS. Then &lt;code lang="eq.latex"&gt;k&lt;/code&gt; is characteristic iff &lt;code lang="eq.latex"&gt;\mathcal{H} \oplus \mathbb{R}&lt;/code&gt; is dense in &lt;code lang="eq.latex"&gt;L^2(\mu)&lt;/code&gt; for every probability &lt;code lang="eq.latex"&gt;\mu&lt;/code&gt; on &lt;code lang="eq.latex"&gt;(\Omega, \mathcal{B})&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;A positive definite function &lt;code lang="eq.latex"&gt;\phi&lt;/code&gt; is one such that the induced kernel &lt;code lang="eq.latex"&gt;k(x, y) = \phi(y^{-1} x)&lt;/code&gt; is positive definite. Here the inverse is defined on a group. E.g. for &lt;code lang="eq.latex"&gt;\mathbb{R}^n&lt;/code&gt; the linear space, the group is simply defined by the vector addition, therefore we get kernels in the form of &lt;code lang="eq.latex"&gt;k(x, y) = \phi(x - y)&lt;/code&gt; which is simply the so-called RBF. Gaussian kernels, Laplacian kernels are both in this domain. They are called shift-invariant kernels. 
Bochner's theorem characterizes all the shift-invariant kernels defined on &lt;code lang="eq.latex"&gt;\mathbb{R}^n&lt;/code&gt;: all these functions are characteristic functions (Fourier transforms) of a given Borel measure.&lt;br /&gt;&lt;br /&gt;With some analysis they come to the result if the shift-invariant kernel induced by positive definite function &lt;code lang="eq.latex"&gt;\phi&lt;/code&gt; is characteristic, the corresponding Borel measure has &lt;code lang="eq.latex"&gt;\mathbb{R}^n&lt;/code&gt; as its support or we may say the inverse Fourier transform of &lt;code lang="eq.latex"&gt;\phi&lt;/code&gt; has &lt;code lang="eq.latex"&gt;\mathbb{R}^n&lt;/code&gt; as its support.&lt;br /&gt;&lt;br /&gt;This paper takes the result further onto more general algebraic structures, i.e. groups and semi-groups. Their claim is as long as we might find an analog of Fourier transform, we might have something similar to Bochner's theorem. So the algebraic structure is a locally-compact Abelian group.&lt;br /&gt;&lt;br /&gt;The paper also considers non-Abelian groups and semi-groups. They get similar results. It's very theoretical... hmm... 
not enough mathematical preparation though.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5121988705137925528?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5121988705137925528/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5121988705137925528' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5121988705137925528'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5121988705137925528'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/07/characteristic-kernels-on-groups-and.html' title='Characteristic Kernels on Groups and Semigroups'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7391597294615596304</id><published>2009-05-17T19:46:00.004+08:00</published><updated>2009-05-18T08:37:52.758+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='feature selection'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>Supervised Feature Selection via Dependence Estimation</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.machinelearning.org/proceedings/icml2007/papers/244.pdf"&gt;Le Song, Alex Smola, Arthur Gretton, Karsten M. 
Borgwardt and Justin Bedo&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;Feature selection has been mentioned in a &lt;a href="http://paperscanner.blogspot.com/2009/03/dimensionality-reduction-for-supervised.html"&gt;KDR paper&lt;/a&gt;. This paper uses HSIC and backward greedy algorithm. That's each time they pick up a dimension to throw away while the rest has the highest HSIC. I thought they might throw away the one with lowest HSIC but they enjoyed using HSIC as an lower bound of dependence.&lt;br /&gt;&lt;br /&gt;The paper also mentions the connection between HSIC and other criterion such as MMD in &lt;a href="http://paperscanner.blogspot.com/2009/05/kernel-approach-to-comparing.html"&gt;a prevously scanned paper&lt;/a&gt; and KTA (it looks like an uncentered version of HSIC).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7391597294615596304?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7391597294615596304/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7391597294615596304' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7391597294615596304'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7391597294615596304'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/supervised-feature-selection-via.html' title='Supervised Feature Selection via Dependence Estimation'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-8984328591637216415</id><published>2009-05-17T14:54:00.007+08:00</published><updated>2009-05-17T15:50:05.077+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='graph embedding'/><category scheme='http://www.blogger.com/atom/ns#' term='dimension reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='unsupervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><category scheme='http://www.blogger.com/atom/ns#' term='manifold learning'/><title type='text'>K-means Clustering via Principal Component Analysis</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://ranger.uta.edu/%7Echqding/papers/KmeansPCA1.pdf"&gt;Cris Ding and Xiaofeng He&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;Th most important result of this paper is that principal components are the continuous solutions to the discrete cluster membership indicators for &lt;span style="font-style: italic;"&gt;K&lt;/span&gt;-means clustering and the subspace spanned by the cluster centroids are given by the spectral expansion of the data covariance matrix truncated at &lt;span style="font-style: italic;"&gt;K&lt;/span&gt;-1 terms.&lt;br /&gt;&lt;br /&gt;I'd like to say their discovery is very similar to that of normalized cut and the spectral expansion of Laplacian matrix. Let&lt;pre lang="eq.latex"&gt;J_K = \sum_{k = 1}^K \sum_{i \in C_k} \|x_i - m_k \|^2&lt;/pre&gt;be the objective of &lt;span style="font-style: italic;"&gt;K&lt;/span&gt;-means clustering,&lt;pre lang="eq.latex"&gt;d(C_k, C_l) = \sum_{i \in C_k} \sum_{j \in C_l} \| x_i - x_j \|^2&lt;/pre&gt;be the distance between two clusters. 
It can be shown that&lt;pre lang="eq.latex"&gt;J_K = N \bar{y}^2 - \frac{1}{2} J_D,&lt;/pre&gt;where in the 2-way clustering case&lt;pre lang="eq.latex"&gt;J_D = \frac{N_1 N_2}{N} \left( 2 \frac{d(C_1, C_2)}{N_1 N_2} - \frac{d(C_1, C_1)}{N_1^2} - \frac{d(C_2, C_2)}{N_2^2}\right).&lt;/pre&gt;Further we have&lt;pre lang="eq.latex"&gt;\frac{d(C_1, C_2)}{N_1 N_2} = \frac{d(C_1, C_1)}{N_1^2} + \frac{d(C_2, C_2)}{N_2^2} + (m_1 - m_2)^2.&lt;/pre&gt;The important theorem they proved says in 2-way clustering, the continuous solution is given by the leading principal components, those positive in one cluster and negative in the other. The optimal K-means solution in this case can be bounded with the leading spectrum,&lt;pre lang="eq.latex"&gt;N \bar{y}^2 - \lambda_1 \leq J_{2} \leq N \bar{y}^2.&lt;/pre&gt;If we create a fully connected graph with each weight &lt;code lang="eq.latex"&gt;\frac{1}{N}&lt;/code&gt;, the linearization of the N-cut algorithm is PCA and the N-cut is simply K-means, is it right? The difference is PCA is done in the feature space and spectral clustering employs the embedding space Laplacian eigenmaps learns.&lt;br /&gt;&lt;br /&gt;For K-way clustering, we have similar results. But unlike 2-way clustering, we cannot interpret the PC in a similar way. Maybe &lt;a href="http://paperscanner.blogspot.com/2009/02/spectral-matting.html"&gt;the matting paper&lt;/a&gt; has a better interpretation. Their second result is about the centroids.&lt;br /&gt;&lt;br /&gt;This result has several implications. Spectral clustering is done in the embedding space learned by Laplacian eigenmap; PCA/K-means is done in the feature space; KPCA/Kernel K-means is done in the kernel-induced space. 
So if we consider the unsupervised KDR idea, can we find a similar interpretation such as clustering?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-8984328591637216415?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/8984328591637216415/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=8984328591637216415' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8984328591637216415'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/8984328591637216415'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/k-means-clustering-via-principal.html' title='K-means Clustering via Principal Component Analysis'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-5805467916656827384</id><published>2009-05-12T05:07:00.004+08:00</published><updated>2009-05-12T09:35:11.844+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dimension reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='unsupervised learning'/><title type='text'>Visualizaing Data using t-SNE</title><content type='html'>by Laurens van der Maaten and Geoffrey Hinton&lt;br /&gt;&lt;br /&gt;This paper proposes a variant of SNE. The SNE was previously proposed by Hinton and Roweis. 
Its idea is to find a projection such that the neighboring probability is preserved: given the coordinates &lt;code lang="eq.latex"&gt;x_i&lt;/code&gt;, the neighboring probability is&lt;pre lang="eq.latex"&gt;p_{i \mid j} = \frac{g_j(x_i, x_j)}{\sum_k g_j(x_k, x_j)}.&lt;/pre&gt;In the projection, if we define a similar probability &lt;code lang="eq.latex"&gt;q_{i \mid j}&lt;/code&gt;, we wish to have a most similar distribution of these samples, therefore we intend to minimize&lt;pre lang="eq.latex"&gt;\sum_j \sum_i p_{i \mid j} \log \frac{p_{i \mid j}}{q_{i \mid j}}.&lt;/pre&gt; The original setting is taking &lt;code lang="eq.latex"&gt;g_j(x_i, x_j)&lt;/code&gt; with Gaussian kernel (with different variances). By setting a so-called perplexity, we then binary-search a proper variance for each sample. This perplexity corresponds to the continuous version of effective neighbors. And we compute the gradient with respect to &lt;code lang="eq.latex"&gt;y_j&lt;/code&gt;:&lt;pre lang="eq.latex"&gt;\frac{\partial \mathrm{KL}}{\partial y_j} = 2 \sum_i ( p_{i \mid j} - q_{i \mid j} + p_{j \mid i} - q_{j \mid i})(y_j - y_i).&lt;/pre&gt;The gradient descent is supplemented with a momentum term.&lt;br /&gt;&lt;br /&gt;The so-called t-SNE has two major modifications to the SNE model. For one thing, it uses a symmetric &lt;code lang="eq.latex"&gt;p_{i, j}&lt;/code&gt; instead, which is simply &lt;code lang="eq.latex"&gt;(p_{i \mid j} + p_{j \mid i}) / 2&lt;/code&gt;; this simplifies the derivatives of the objective function,&lt;pre lang="eq.latex"&gt;\frac{\partial \mathrm{KL}}{\partial y_j} = 4 \sum_i (p_{i, j} - q_{i, j})(y_j - y_i).&lt;/pre&gt;For another thing, we use a different pdf for the projection instead of the Gaussian kernel. The "t" comes from here, since they propose using t-distribution with 1 degree of freedom (therefore a Cauchy distribution). 
Then the derivatives become&lt;pre lang="eq.latex"&gt;\frac{\partial \mathrm{KL}}{\partial y_j} = 4 \sum_i (p_{i, j} - q_{i,j})(y_j - y_i)(1 + \| y_j - y_i\|^2)^{-1}.&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;The great thing about t-SNE is its capability to yield a good embedding. But somehow it is a little difficult to increase the embedding dimension. Not sure about it though, doing some experiments with it.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-5805467916656827384?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/5805467916656827384/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=5805467916656827384' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5805467916656827384'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/5805467916656827384'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/visualizaing-data-using-t-sne.html' title='Visualizaing Data using t-SNE'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-3138210307858170650</id><published>2009-05-11T13:52:00.004+08:00</published><updated>2009-05-17T16:30:47.852+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='clustering'/><category scheme='http://www.blogger.com/atom/ns#' 
term='kernel'/><title type='text'>A Dependence Maximaization View of Clustering</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.machinelearning.org/proceedings/icml2007/papers/243.pdf"&gt;Le Song, Alex Smola, Arthur Gretton and Karsten M. Borgwardt&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;HSIC is an upper bound of COCO, and COCO being zero has been proved in theory to be equivalent to independence. Therefore HSIC can be used as an independence test. However, as an upper bound, it might not be suitable if we want to maximize the dependence of r.v.s IMHO. So the motivation of using HSIC for this purpose (clustering) is still vague.&lt;br /&gt;&lt;br /&gt;For clustering, we maximize&lt;pre lang="eq.latex"&gt;\mathrm{tr}(H K_x H K_y)&lt;/pre&gt;for making features and labels dependent. So we have to find a kernel for the discrete variable &lt;code lang="eq.latex"&gt;y&lt;/code&gt;: a delta kernel; or the kernel in the paper,&lt;pre lang="eq.latex"&gt;\Pi A \Pi^\top&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;A&lt;/code&gt; is the Gram matrix for different clusters and &lt;code lang="eq.latex"&gt;\Pi&lt;/code&gt; is the assignment matrix. The paper also considers other kernels, relating this algorithm with kernel PCA (maybe kernel k-means also?).&lt;br /&gt;&lt;br /&gt;The optimization is quite difficult, as with other clustering algorithms. It initializes an assignment matrix randomly and repeats updating the assignment matrix until it converges. We loop over the samples, finding the best assignment that maximizes the HSIC. Please notice the matrix &lt;code lang="eq.latex"&gt;A&lt;/code&gt; will be updated when the assignment matrix is updated. Like k-means, each sample has the same weight and &lt;code lang="eq.latex"&gt;A&lt;/code&gt; is simply a diagonal matrix with the inverse of the number of samples in the corresponding cluster. Another choice generalizes the weights. 
This leads to a family of clustering algorithms by choosing &lt;code lang="eq.latex"&gt;A&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;Maybe it's worth considering the design idea more carefully.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-3138210307858170650?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/3138210307858170650/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=3138210307858170650' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3138210307858170650'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/3138210307858170650'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/dependence-maximaization-view-of.html' title='A Dependence Maximaization View of Clustering'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-9099603310391634668</id><published>2009-05-11T12:44:00.003+08:00</published><updated>2009-05-11T13:34:46.298+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>A Kernel Approach to Comparing Distributions</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.kyb.tuebingen.mpg.de/publications/attachments/Gretton_4426%5B0%5D.pdf"&gt;Arthur 
Gretton, Karsten M. Borgwardt, Malte Rasch, Bernhard Scholkopf and Alexander J. Smola&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper takes advantage of kernel methods to test whether two r.v.s have the same distribution. The idea is very close to the independence test based on kernel methods. We know that for two r.v.s, &lt;code lang="eq.latex"&gt;\mathbb{E} f(x) = \mathbb{E} f(y)&lt;/code&gt; for any function f if their distributions are identical. Given a large enough function set, we may use&lt;pre lang="eq.latex"&gt;\mathrm{MMD}(\mathcal{F}, x, y ) = \sup_{f \in \mathcal{F}} \left( \mathbb{E} f(x) - \mathbb{E} f(y) \right).&lt;/pre&gt;Then we try some universal kernel and its corresponding RKHS. With some derivation, the empirical evaluation of MMD is based on&lt;pre lang="eq.latex"&gt;\mathrm{MMD}^2(\mathcal{F}, x, y) = \mathbb{E} \langle \phi(x), \phi(x')\rangle + \mathbb{E} \langle \phi(y), \phi(y')\rangle - 2 \mathbb{E} \langle \phi(x), \phi(y)\rangle \approx \frac{1}{m^2} \sum_{i, j = 1}^m k(x_i, x_j) + \frac{1}{n^2} \sum_{i, j = 1}^n k(y_i, y_j) - 2 \frac{1}{mn} \sum_{i = 1}^m \sum_{j = 1}^n k(x_i, y_j).&lt;/pre&gt;&lt;br /&gt;&lt;br /&gt;I am not sure whether a later work in NIPS 2008 is based on this, which will be scanned soon.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-9099603310391634668?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/9099603310391634668/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=9099603310391634668' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/9099603310391634668'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/447099751126587323/posts/default/9099603310391634668'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/kernel-approach-to-comparing.html' title='A Kernel Approach to Comparing Distributions'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-7922986540151172805</id><published>2009-05-11T10:22:00.001+08:00</published><updated>2009-05-12T10:27:10.625+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='independent component analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>Measuring Statistical Dependence with Hilbert-Schmidt Norms</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://www.kyb.mpg.de/publications/attachments/hsicALT05_%5B0%5D.pdf"&gt;Arthur Gretton, Olivier Bousquet, Alex Smola and Bernhard Scholkopf&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper uses the so-called Hilbert-Schmidt norms instead of the &lt;code lang="eq.latex"&gt;L^2&lt;/code&gt; norm (corresponding to the COCO independence test) in &lt;a href="http://paperscanner.blogspot.com/2009/05/kernel-constrained-covariance-for.html"&gt;the previously scanned paper&lt;/a&gt;, resulting in the so-called HSIC (Hilbert-Schmidt independence criterion), which is in practice much easier to calculate. Compared with COCO, which uses the square root of the largest singular value of &lt;code lang="eq.latex"&gt;\tilde{K}^{(x)} \tilde{K}^{(y)}&lt;/code&gt;, HSIC actually uses the whole spectrum, i.e. the trace of &lt;code lang="eq.latex"&gt;\tilde{K}^{(x)} \tilde{K}^{(y)}&lt;/code&gt;. 
Since the Frobenius norm is an upper bound for the &lt;code lang="eq.latex"&gt;L^2&lt;/code&gt; norm, a similar independence equivalence can be obtained.&lt;br /&gt;&lt;br /&gt;Please notice the relationship of KCC, COCO and HSIC. For KCC, we use the maximum kernel correlation as the criterion for independence test, but then we have to use the regularized version to avoid the common case when it does not give the desired result. In COCO, we constrain both covariances to identity and then the problem becomes seeking the largest singular value of the product of two centered Gram matrices while KCC solves the generalized eigenvalue problem. Therefore when we deal with multiple r.v.s, we can do it in the same way as CCA does. So in a way you know why it is called COCO. These three ideas are closely related, just as in the KMI paper, KMI and KGV are closely related.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-7922986540151172805?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/7922986540151172805/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=7922986540151172805' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7922986540151172805'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/7922986540151172805'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/measuring-statistical-dependence-with.html' title='Measuring Statistical Dependence with Hilbert-Schmidt 
Norms'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6245855506070282729</id><published>2009-05-10T22:01:00.000+08:00</published><updated>2009-05-10T21:54:26.713+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='independent component analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>Kernel Constrained Covariance for Dependence Measurement</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://research.microsoft.com/pubs/67360/pdf3174.pdf"&gt;Arthur Gretton, Alexander Smola, Olivier Bousquet, Ralf Herbrich, Andrei Belitski, Mark Augath, Yusuke Murayama, Jon Pauls, Bernhard Scholkopf and Nikos Logothetis&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;For measuring independence, they propose another kernel-based criterion COCO (constrained covariance, in &lt;a href="http://paperscanner.blogspot.com/2009/05/kernel-mutual-information.html"&gt;a previous paper&lt;/a&gt;, it is called kernel covariance, KC, however their objective actually comes from the mutual information of Gaussian r.v.s) other than &lt;a href="http://paperscanner.blogspot.com/2009/03/kernel-independent-component-analysis.html"&gt;the previously scanned paper of Jordan's&lt;/a&gt; (KCC and KGV). This has been mentioned in their earlier work.&lt;br /&gt;&lt;br /&gt;This work has quite different concerns, since previous papers did not say how to choose the kernels. They prove COCO is only zero at independence for universal kernels. 
The so-called universal kernel is a kernel such that on a compact metric space &lt;code lang="eq.latex"&gt;(\mathcal{X}, d)&lt;/code&gt;, the RKHS induced by the kernel &lt;code lang="eq.latex"&gt;k(\cdot, \cdot)&lt;/code&gt; is dense in the continuous function space over &lt;code lang="eq.latex"&gt;\mathcal{X}&lt;/code&gt;, namely &lt;code lang="eq.latex"&gt;C(\mathcal{X})&lt;/code&gt; w.r.t. the topology induced by the infinity norm &lt;code lang="eq.latex"&gt;\| f - g \|_\infty&lt;/code&gt;. They proved Gaussian kernels and Laplacian kernels are universal.&lt;br /&gt;&lt;br /&gt;They also point out the limitation of independence tests based on universal kernels. The proposed COCO also has its adversary. That is, even when we do have a small COCO, the r.v.s may still be dependent. But they found their calculation of COCO has an exponential convergence rate to the true COCO value.&lt;br /&gt;&lt;br /&gt;The calculation of COCO is equivalent to&lt;pre lang="eq.latex"&gt;\mathrm{COCO}(z, F, G) = \frac{1}{n}\sqrt{\| \tilde{K}^f \tilde{K}^g \|_2}.&lt;/pre&gt; Their later paper actually uses the Frobenius norm. 
Their experiment shows an example of applying COCO to ICA on fMRI data, compared with KMI and correlation method (only using correlation as a testing of independence).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6245855506070282729?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6245855506070282729/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6245855506070282729' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6245855506070282729'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6245855506070282729'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/kernel-constrained-covariance-for.html' title='Kernel Constrained Covariance for Dependence Measurement'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-4286049335382773934</id><published>2009-05-10T19:29:00.005+08:00</published><updated>2009-05-10T21:37:23.574+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='independent component analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='kernel'/><title type='text'>The Kernel Mutual Information</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" 
href="http://www.kyb.mpg.de/publications/pss/ps2133.ps"&gt;Arthur Gretton, Ralf Herbrich and Alexander J. Smola&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;We know mutual information can be used to test the independence of r.v.s. The difficulty with mutual information is that the distribution is usually unknown, either we have to make a density estimation or entropy estimation from samples.&lt;br /&gt;&lt;br /&gt;This paper introduces the so-called KMI (kernel mutual information), which in practice has comparable performance with KGV (in &lt;a href="http://paperscanner.blogspot.com/2009/03/kernel-independent-component-analysis.html"&gt;Bach and Jordan's paper&lt;/a&gt;). It comes from measuring the mutual information between two multi-variate Gaussian vector,&lt;pre lang="eq.latex"&gt;I(x; y) = -\frac{1}{2} \log \left( \prod_{i = 1}^{\min(p_x, p_y)} (1 - \rho_i^2)\right).&lt;/pre&gt;We use the correlation in the RKHS for this &lt;code lang="eq.latex"&gt;\rho_i&lt;/code&gt;,&lt;pre lang="eq.latex"&gt;\rho_i = \frac{c_i^\top( P_{x, y} - p_x p_y) d_i}{\sqrt{c_i^\top D_x c_i d_i^\top D_y d_i}},&lt;/pre&gt;where &lt;code lang="eq.latex"&gt;P_{x, y}, p_x, p_y, D_x, D_y&lt;/code&gt; are approximated with samples and an assumed grid(it will be cancelled out in the end). By relaxing the denominator (to a bigger value), we find an upper bound for the mutual information,&lt;pre lang="eq.latex"&gt;M(z) = -\frac{1}{2} \log\left( \big| I - (\nu_x \nu_y)\tilde{K}^{(x)} \tilde{K}^{(y)} \big|\right),&lt;/pre&gt; where &lt;code lang="eq.latex"&gt;\tilde{K}^{(x)}&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\tilde{K}^{(y)}&lt;/code&gt; are centered Gram matrices. This is the criterion for KMI. 
Since it's an upper bound for the mutual information, we can use it as a contrast function (I don't know why the authors think we can derive the result from Theorem 1, confused about their idea).&lt;br /&gt;&lt;br /&gt;In their formulation, (regularized) KGV can be regarded as another way of relaxing the covariance. Therefore likewise this independence measurement can also be used for ICA task. In a way, this paper is a generalization of KGV, but I am still confused about their idea.&lt;br /&gt;&lt;br /&gt;==&lt;br /&gt;later I realized that the KMI is the upper bound of KC, therefore when KMI = 0 the KC will be 0 too.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-4286049335382773934?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/4286049335382773934/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=4286049335382773934' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4286049335382773934'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/4286049335382773934'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/05/kernel-mutual-information.html' title='The Kernel Mutual Information'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-360987930217466003</id><published>2009-04-24T10:01:00.002+08:00</published><updated>2009-04-24T22:53:13.076+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='probabilistic graphical model'/><title type='text'>Isotonic Conditional Random Fields and Local Sentiment Flow</title><content type='html'>&lt;div style="text-align: right;"&gt;by  &lt;a style="font-style: italic;" href="http://books.nips.cc/papers/files/nips19/NIPS2006_0384.pdf"&gt;Yi Mao and Guy Lebanon&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This paper introduces a variant of CRF, adding a constraint by prior knowledge. Since we know for some words, they are representing a positive/negative sentiment. Therefore the corresponding feature &lt;code lang="eq.latex"&gt;f_{(\sigma, w)}&lt;/code&gt;, which means &lt;pre lang="eq.latex"&gt;f_{(\sigma, w)}(y) = \begin{cases} 1 &amp;amp; y = \sigma, x = w \\ 0 &amp;amp; \text{otherwise} \end{cases},&lt;/pre&gt; has a larger/smaller parameter &lt;code lang="eq.latex"&gt;\mu_{(\sigma, w)}&lt;/code&gt;. That is to say, if &lt;code lang="eq.latex"&gt;w&lt;/code&gt; is a special word indicating positive, then &lt;code lang="eq.latex"&gt;\mu_{(\sigma, w)} \geq \mu_{(\sigma', w)}&lt;/code&gt; when &lt;code lang="eq.latex"&gt;\sigma \geq \sigma'&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;This style of constraints will lead to a convex optimization problem though. 
The authors prove that given a sequence &lt;code lang="eq.latex"&gt;x&lt;/code&gt; and the corresponding labelling &lt;code lang="eq.latex"&gt;y&lt;/code&gt;, letting &lt;code lang="eq.latex"&gt;x' = (x_1, \ldots, x_j \cup \{ w\}, \ldots, x_n)&lt;/code&gt;, if &lt;code lang="eq.latex"&gt;\mu_{(t_j, w)} \geq \mu_{(s_j, w)}&lt;/code&gt; for two labellings s and t, then&lt;pre lang="eq.latex"&gt;\frac{\Pr(s \mid x)}{\Pr(s \mid x')} \geq \frac{\Pr(t \mid x)}{\Pr(t \mid x')}.&lt;/pre&gt;This gives a new interpretation for the constraints. The model is reparameterized with the Möbius inversion theorem (what is that?) and solved.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-360987930217466003?l=paperscanner.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/360987930217466003/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=360987930217466003' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/360987930217466003'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/360987930217466003'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/04/isotonic-conditional-random-fields-and.html' title='Isotonic Conditional Random Fields and Local Sentiment Flow'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' 
src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-447099751126587323.post-6753852605428289865</id><published>2009-04-20T20:22:00.013+08:00</published><updated>2009-04-23T23:39:34.751+08:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='tensor'/><title type='text'>Tensor Decomposition and Applications</title><content type='html'>&lt;div style="text-align: right;"&gt;by &lt;a style="font-style: italic;" href="http://csmr.ca.sandia.gov/%7Etgkolda/pubs/bibtgkfiles/TensorReview-preprint.pdf"&gt;Tamara G. Kolda and Brett W. Bader&lt;/a&gt;&lt;br /&gt;&lt;/div&gt;&lt;br /&gt;This is the first systematic paper I read on tensors. A tensor in computer science is a multi-dimensional array (I am still wondering whether it has anything to do with the term tensor in differential geometry).&lt;br /&gt;&lt;br /&gt;So before we talk about tensors, here are several notations we'd like to use in the text:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;The &lt;span style="font-style: italic;"&gt;order&lt;/span&gt; of the tensor is the number of dimensions: vectors are order-1 tensors, matrices are order-2.&lt;/li&gt;&lt;li&gt;An order-1 tensor is denoted with &lt;code lang="eq.latex"&gt;\mathbf{a}&lt;/code&gt;, an order-2 tensor with &lt;code lang="eq.latex"&gt;\mathbf{A}&lt;/code&gt; and one of higher-order with &lt;code lang="eq.latex"&gt;\mathcal{A}&lt;/code&gt;.&lt;/li&gt;&lt;li&gt;A &lt;span style="font-style: italic;"&gt;fibre&lt;/span&gt; is an analog of a row/column of a matrix in tensor. 
A tensor &lt;code lang="eq.latex"&gt;\mathcal{X}&lt;/code&gt; of order &lt;code lang="eq.latex"&gt;n&lt;/code&gt; has a fibre in the form of &lt;code lang="eq.latex"&gt;\mathbf{x}_{i_1, \ldots, i_{j-1}, :,i_{j+1}, \ldots, i_n}&lt;/code&gt; which is the vector obtained by fixing all other index &lt;code lang="eq.latex"&gt;i_k&lt;/code&gt; except one &lt;code lang="eq.latex"&gt;i_j&lt;/code&gt;.&lt;/li&gt;&lt;li&gt;A &lt;span style="font-style: italic;"&gt;slice&lt;/span&gt; of a tensor is obtained by fixing all indices except two, which can be regarded as a plane.&lt;/li&gt;&lt;li&gt;A tensor &lt;code lang="eq.latex"&gt;\mathcal{X} \in \mathbb{R}^{I_1} \times \cdots \times \mathbb{R}^{I_n}&lt;/code&gt; is said to have &lt;span style="font-style: italic;"&gt;rank 1&lt;/span&gt; if &lt;code lang="eq.latex"&gt;\mathcal{X} = \mathbf{a}^{(1)} \circ \cdots \circ \mathbf{a}^{(n)}&lt;/code&gt; where the &lt;code lang="eq.latex"&gt;\circ&lt;/code&gt; operator means &lt;code lang="eq.latex"&gt;x_{i_1, \ldots, i_n} = a_{i_1}^{(1)}\cdots a_{i_n}^{(n)}&lt;/code&gt;.&lt;/li&gt;&lt;li&gt;&lt;span style="font-style: italic;"&gt;Matricization&lt;/span&gt; (a.k.a. &lt;span style="font-style: italic;"&gt;unfolding&lt;/span&gt;, &lt;span style="font-style: italic;"&gt;flattening&lt;/span&gt;) is the process of reordering the tensor data into a matrix. 
The mode-&lt;code lang="eq.latex"&gt;k&lt;/code&gt; unfolding of the order-&lt;code lang="eq.latex"&gt;n&lt;/code&gt; tensor &lt;code lang="eq.latex"&gt;\mathcal{X}&lt;/code&gt; is denoted as &lt;code lang="eq.latex"&gt;\mathbf{X}_{(k)} \in \mathbb{R}^{I_k\times (\prod_{s\neq k} I_s)}&lt;/code&gt;.&lt;/li&gt;&lt;li&gt;The &lt;code lang="eq.latex"&gt;k&lt;/code&gt;-mode product of a tensor &lt;code lang="eq.latex"&gt;\mathcal{X}&lt;/code&gt; and a matrix &lt;code lang="eq.latex"&gt;\mathbf{U} \in \mathbb{R}^{J \times I_k}&lt;/code&gt; is&lt;pre lang="eq.latex"&gt;(\mathcal{X} \times_k \mathbf{U})_{i_1, \ldots, j, \ldots, i_n} = \sum_{i_k = 1}^{I_k} x_{i_1, \ldots, i_k, \ldots, i_n} u_{j, i_k}.&lt;/pre&gt;&lt;/li&gt;&lt;li&gt;The &lt;span style="font-style: italic;"&gt;Kronecker product&lt;/span&gt; of two matrices &lt;code lang="eq.latex"&gt;\mathbf{A}\in \mathbb{R}^{I \times J}, \mathbf{B} \in \mathbb{R}^{K \times L}&lt;/code&gt; is&lt;pre lang="eq.latex"&gt;\mathbf{A} \otimes \mathbf{B} = \begin{pmatrix} a_{1, 1} \mathbf{B} &amp;amp; \cdots &amp;amp; a_{1, J}\mathbf{B} \\ \vdots &amp;amp; \ddots &amp;amp; \vdots \\ a_{I, 1} \mathbf{B} &amp;amp; \cdots &amp;amp; a_{I, J} \mathbf{B} \end{pmatrix}.&lt;/pre&gt;&lt;/li&gt;&lt;li&gt;The &lt;span style="font-style: italic;"&gt;Khatri-Rao product&lt;/span&gt; of the two matrices &lt;code lang="eq.latex"&gt;\mathbf{A} \in \mathbb{R}^{I \times K}, \mathbf{B} \in \mathbb{R}^{J \times K}&lt;/code&gt; is&lt;pre lang="eq.latex"&gt;\mathbf{A} \odot \mathbf{B} = \begin{pmatrix} \mathbf{a}_1 \otimes \mathbf{b}_1 &amp;amp; \cdots &amp;amp; \mathbf{a}_K \otimes \mathbf{b}_K \end{pmatrix}.&lt;/pre&gt;&lt;/li&gt;&lt;li&gt;The Hadamard product is the elementwise product of two matrices &lt;code lang="eq.latex"&gt;\mathbf{A}, \mathbf{B} \in \mathbb{R}^{I \times J}&lt;/code&gt;, i.e.&lt;pre lang="eq.latex"&gt;\mathbf{A} * \mathbf{B} = \begin{pmatrix} a_{1,1} b_{1, 1} &amp;amp; \cdots &amp;amp; a_{1, J} b_{1, J} \\ \vdots &amp;amp; \ddots &amp;amp; \vdots \\ a_{I,1} 
b_{I, 1} &amp;amp; \cdots &amp;amp; a_{I, J} b_{I, J}\end{pmatrix}.&lt;/pre&gt;&lt;/li&gt;&lt;li&gt;The so-called CP decomposition (canonical decomposition, parallel factors) of a tensor is a sum of rank 1 components&lt;pre lang="eq.latex"&gt;\mathcal{X} \approx \sum_{r = 1}^R \mathbf{a}_r^{(1)} \circ \cdots \circ \mathbf{a}_r^{(n)}.&lt;/pre&gt;This is usually denoted with&lt;pre lang="eq.latex"&gt;\mathcal{X} \approx [\![ \mathbf{\lambda}; \mathbf{A}^{(1)}, \ldots, \mathbf{A}^{(n)} ]\!] \stackrel{\triangle}{=} \sum_{r = 1}^R \lambda_r \mathbf{a}_r^{(1)} \circ \cdots \circ \mathbf{a}_r^{(n)}.&lt;/pre&gt;where each &lt;code lang="eq.latex"&gt;\mathbf{A}^{(i)}&lt;/code&gt; has columns normalized to unit length.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;However, low-rank approximation of tensors is not as well defined as that of matrices. For one thing, there may not exist a rank-2 tensor which is nearest to a given rank-3 tensor. Although there are various bounds for the rank, it is still not clear how to estimate the rank of a tensor. Therefore it will be difficult for us to find a good low-rank approximation.&lt;br /&gt;&lt;br /&gt;The most popular algorithm for CP decomposition is ALS (alternating least squares). The idea is very simple: fix all &lt;code lang="eq.latex"&gt;\mathbf{A}^{(i)}&lt;/code&gt; except &lt;code lang="eq.latex"&gt;\mathbf{A}^{(k)}&lt;/code&gt;. 
With the &lt;code lang="eq.latex"&gt;k&lt;/code&gt;-mode unfolding, we will approximate it by tuning &lt;code lang="eq.latex"&gt;\mathbf{A}^{(k)}&lt;/code&gt;, which leads to the following least squares problem,&lt;pre lang="eq.latex"&gt;\min_{\hat{\mathbf{A}}} \| \mathbf{X}_{(k)} - \hat{\mathbf{A}} (\mathbf{A}^{(1)} \odot \cdots \odot \mathbf{A}^{(k-1)} \odot \mathbf{A}^{(k+1)} \odot \cdots \odot \mathbf{A}^{(n)})^\top\|,&lt;/pre&gt;whose least squares solution is&lt;pre lang="eq.latex"&gt;\hat{\mathbf{A}} = \mathbf{X}_{(k)} [ (\mathbf{A}^{(1)} \odot \cdots \odot \mathbf{A}^{(k-1)} \odot \mathbf{A}^{(k+1)} \odot \cdots \odot \mathbf{A}^{(n)})^\top ]^\dagger.&lt;/pre&gt;Please notice that &lt;code lang="eq.latex"&gt;(\mathbf{A} \odot \mathbf{B})^\top (\mathbf{A} \odot \mathbf{B}) = (\mathbf{A}^\top \mathbf{A}) * (\mathbf{B}^\top \mathbf{B})&lt;/code&gt;, therefore&lt;pre lang="eq.latex"&gt;\hat{\mathbf{A}} = \mathbf{X}_{(k)} (\mathbf{A}^{(1)} \odot \cdots \odot \mathbf{A}^{(k-1)} \odot \mathbf{A}^{(k+1)} \odot \cdots \odot \mathbf{A}^{(n)})[(\mathbf{A}^{(1)\top} \mathbf{A}^{(1)} * \cdots * \mathbf{A}^{(k-1)\top} \mathbf{A}^{(k-1)} * \mathbf{A}^{(k+1)\top} \mathbf{A}^{(k+1)} * \cdots * \mathbf{A}^{(n)\top} \mathbf{A}^{(n)})]^\dagger.&lt;/pre&gt;The values of &lt;code lang="eq.latex"&gt;\lambda&lt;/code&gt; are updated with the norms of the columns of &lt;code lang="eq.latex"&gt;\hat{\mathbf{A}}&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\mathbf{A}^{(k)}&lt;/code&gt; is the column-normalized &lt;code lang="eq.latex"&gt;\hat{\mathbf{A}}&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;Another important problem with tensors is the Tucker decomposition (a.k.a. HOSVD, higher-order SVD): we seek to find&lt;pre lang="eq.latex"&gt;\mathcal{X} \approx \mathcal{G} \times_1 \mathbf{A}^{(1)} \times_2 \cdots \times_{n} \mathbf{A}^{(n)}.&lt;/pre&gt;Usually we need a degenerate version of HOSVD, which relates to the &lt;code lang="eq.latex"&gt;k&lt;/code&gt;-rank approximation. 
This problem can also be solved with ALS.&lt;br /&gt;&lt;br /&gt;There are a lot of decompositions related to tensors:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;INDSCAL (individual differences in scaling), for order-3 tensors where the first two dimensions are symmetric.&lt;/li&gt;&lt;li&gt;PARAFAC2 (parallel factorization) is, strictly speaking, not a problem of tensor decomposition. It seeks to find an approximation of several matrices &lt;code lang="eq.latex"&gt;\mathbf{X}_k&lt;/code&gt;, each of which is approximated with &lt;code lang="eq.latex"&gt;\mathbf{U}_k \mathbf{S}_k \mathbf{V}^\top&lt;/code&gt;, where &lt;code lang="eq.latex"&gt;\mathbf{U}_k&lt;/code&gt; and &lt;code lang="eq.latex"&gt;\mathbf{V}&lt;/code&gt; are orthogonal matrices and &lt;code lang="eq.latex"&gt;\mathbf{S}_k&lt;/code&gt; are diagonal matrices.&lt;/li&gt;&lt;li&gt;CANDELINC (canonical decomposition with linear constraints), in which we know the subspace of each dimension, can be turned into a standard CP decomposition this way.&lt;/li&gt;&lt;li&gt;DEDICOM (decomposition into directional components), seeks approximation of an asymmetric matrix &lt;code lang="eq.latex"&gt;\mathbf{X} = \mathbf{A} \mathbf{R} \mathbf{A}^\top&lt;/code&gt;.&lt;/li&gt;&lt;li&gt;PARATUCK2 seeks to find an approximation of &lt;code lang="eq.latex"&gt;\mathcal{X}&lt;/code&gt;,&lt;pre lang="eq.latex"&gt;\mathbf{X}_k \approx \mathbf{A} \mathbf{D}_k^A \mathbf{R} \mathbf{D}_k^B \mathbf{B}^\top&lt;/pre&gt; where &lt;code lang="eq.latex"&gt;\mathbf{A}, \mathbf{B}&lt;/code&gt; are matrices and &lt;code lang="eq.latex"&gt;\mathbf{D}_k^A, \mathbf{D}_k^B&lt;/code&gt; are diagonal matrices.&lt;/li&gt;&lt;li&gt;NTF (nonnegative tensor factorization), on each dimension, an NMF is applied instead of PCA.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/447099751126587323-6753852605428289865?l=paperscanner.blogspot.com' alt='' 
/&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://paperscanner.blogspot.com/feeds/6753852605428289865/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=447099751126587323&amp;postID=6753852605428289865' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6753852605428289865'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/447099751126587323/posts/default/6753852605428289865'/><link rel='alternate' type='text/html' href='http://paperscanner.blogspot.com/2009/04/tensor-decomposition-and-aplications.html' title='Tensor Decomposition and Applications'/><author><name>demonstrate</name><uri>http://www.blogger.com/profile/07866269874003406275</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='22' height='32' src='http://foto.yculblog.com/photo/d/demonstrate/heli.png'/></author><thr:total>0</thr:total></entry></feed>
