// API callback
related_results_labels_thumbs({"version":"1.0","encoding":"UTF-8","feed":{"xmlns":"http://www.w3.org/2005/Atom","xmlns$openSearch":"http://a9.com/-/spec/opensearchrss/1.0/","xmlns$blogger":"http://schemas.google.com/blogger/2008","xmlns$georss":"http://www.georss.org/georss","xmlns$gd":"http://schemas.google.com/g/2005","xmlns$thr":"http://purl.org/syndication/thread/1.0","id":{"$t":"tag:blogger.com,1999:blog-2868824907842590784"},"updated":{"$t":"2015-07-13T04:34:25.871-07:00"},"category":[{"term":"industry"},{"term":"learn"},{"term":"architecture"},{"term":"vendors"},{"term":"solutions"},{"term":"aprilfool"},{"term":"high5"},{"term":"influencers"},{"term":"quiz"},{"term":"training"}],"title":{"type":"text","$t":"hadoopsphere.com"},"subtitle":{"type":"html","$t":""},"link":[{"rel":"http://schemas.google.com/g/2005#feed","type":"application/atom+xml","href":"http:\/\/www.hadoopsphere.com\/feeds\/posts\/default"},{"rel":"self","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/-\/industry?alt=json-in-script\u0026max-results=4"},{"rel":"alternate","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/search\/label\/industry"},{"rel":"hub","href":"http://pubsubhubbub.appspot.com/"},{"rel":"next","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/-\/industry\/-\/industry?alt=json-in-script\u0026start-index=5\u0026max-results=4"}],"author":[{"name":{"$t":"Desk Editor"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"http:\/\/img2.blogblog.com\/img\/b16-rounded.gif"}}],"generator":{"version":"7.00","uri":"http://www.blogger.com","$t":"Blogger"},"openSearch$totalResults":{"$t":"56"},"openSearch$startIndex":{"$t":"1"},"openSearch$itemsPerPage":{"$t":"4"},"entry":[{"id":{"$t":"tag:blogger.com,1999:blog-2868824907842590784.post-6680837454101243001"},"published":{"$t":"2015-05-08T11:05:00.000-07:00"},"updated":{"$t":"2015-05-08T11:22:09.651-07:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"industry"}],"title":{"type":"text","$t":"Low latency SQL querying on HBase"},"content":{"type":"html","$t":"\u003Cdiv dir=\"ltr\" style=\"text-align: left;\" trbidi=\"on\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-YfhOr2RLjAc\/VUz4KqJNGWI\/AAAAAAAAAcU\/nmBd12k11W4\/s1600\/nosql-sql-hadoopsphere.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"132\" src=\"http:\/\/3.bp.blogspot.com\/-YfhOr2RLjAc\/VUz4KqJNGWI\/AAAAAAAAAcU\/nmBd12k11W4\/s320\/nosql-sql-hadoopsphere.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EHBase has emerged as one of the most popular NoSQL database offering distributed, versioned, non-relational tables hosted on commodity hardware. However, with a large set of users coming from a relational SQL world, it made sense to bring the SQL back in this NoSQL. With Apache Phoenix, database professionals get a convenient way to query HBase through SQL in a fast and efficient manner. Continuing our \u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/05\/sql-on-hbase-with-apache-phoenix.html\"\u003Ediscussion with James Taylor\u003C\/a\u003E, the founder of Apache Phoenix, we focus on the functional aspects of Phoenix in this second part of interaction.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAlthough Apache Phoenix started off with distinct low latency advantage, have the other options like Hive\/Impala (integrated with HBase) caught up in terms of performance?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ENo, these other tools such as Hive and Impala have not invested in improving performance against HBase data, so if anything, Phoenix's advantage has only gotten bigger as our performance improves.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ESee \u003Ca href=\"http:\/\/phoenix.apache.org\/performance.html#Phoenix_vs_Impala_running_over_HBase\" rel=\"nofollow\" target=\"_blank\"\u003Ethis link for comparison\u003C\/a\u003E\u0026nbsp;of Apache Phoenix with Apache Hive and Cloudera Impala.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-Uyi2vSWG4fg\/VUz4EeJVe9I\/AAAAAAAAAcM\/WiFKYXWYHxo\/s1600\/PhoenixVsImpala.png\" imageanchor=\"1\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/4.bp.blogspot.com\/-Uyi2vSWG4fg\/VUz4EeJVe9I\/AAAAAAAAAcM\/WiFKYXWYHxo\/s1600\/PhoenixVsImpala.png\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: small;\"\u003EApache Phoenix and Cloudera Impala comparison\u003Cbr \/\u003E(Query:\u0026nbsp;\u003Cspan style=\"background-color: white; color: #2c3e50; line-height: 21px; text-align: start;\"\u003Eselect count(1) from table over 1M and 5M rows)\u003C\/span\u003E\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhat lies ahead on roadmap for Apache Phoenix in 2015?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EOur upcoming 4.4 release introduces a number of new features: \u0026nbsp;User Defined Functions, UNION ALL support, Spark integration, Query Server to support thin (and eventually non Java) clients, Pherf tool for testing at scale, MR-based index population, and support for HBase 1.0.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWe are also actively working on transaction support by integrating with Tephra (http:\/\/tephra.io\/). If all goes according to plan, we'll release this after our 4.4 release (in 4.5 or 5.0), as this work is pretty far along (check out our \u003Ca href=\"http:\/\/s.apache.org\/IMM\" rel=\"nofollow\" target=\"_blank\"\u003Etxn branch\u003C\/a\u003E to play around with it).\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIn parallel with this, we're working on Apache Calcite integration to improve interop with the greater Hadoop ecosystem through plugging into a rich cost-based optimizer framework. IMHO, this is the answer to ubiquitous usage of Phoenix for HBase data across queries that get data from any other Calcite adapter source (RDBMS, Hive, Drill, Kylin, etc.). This will allow a kind of plug and play approach with this the push down being decided based on a common cost model that all these other tools plug into.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ECome hear more and see a demo at our \u003Ca href=\"http:\/\/www.meetup.com\/San-Francisco-Apache-Phoenix-Meetup\/\" rel=\"nofollow\" target=\"_blank\"\u003Eupcoming Meetups\u003C\/a\u003E or at HBaseCon 2015.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EDoes Apache Phoenix also talk to HCatalog or is that interaction left off to HBase itself?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EPhoenix manages its metadata through a series of internal HBase tables. It has no interaction with HCatalog.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ECan Apache Phoenix be connected with BI tools which have traditionally relied on ODBC drivers?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EPhoenix can connect with BI tools that support a JDBC driver. However, BI tools that rely on an ODBC driver are more challenging. There's a new thin driver plus query server model that we support in our upcoming 4.4 release which will help, though. This thin driver will open the door for an ODBC driver to be achievable by writing the same protocol that our Java-based thin driver use (JSON over http).\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhich commercial distributions is Apache Phoenix part of?\u0026nbsp;\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EApache Phoenix is available in the Hortonworks HDP distribution. Make sure to let your vendor of choice know that you'd like to see Phoenix included in their distribution as well, as that's what will make it happen.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: right;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Ci\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/05\/sql-on-hbase-with-apache-phoenix.html\"\u003E\u0026lt;\u0026lt; SQL on HBase with Apache Phoenix\u003C\/a\u003E\u003C\/i\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: right;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Ci\u003E\u003Cbr \/\u003E\u003C\/i\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ctable bgcolor=\"#CCCCCC\" style=\"text-align: justify;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-ZBIz6RSNp_Q\/VUpttNxNBsI\/AAAAAAAAAbw\/ylbcy0wHBCw\/s1600\/James_S-square.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cimg border=\"0\" height=\"200\" src=\"http:\/\/2.bp.blogspot.com\/-ZBIz6RSNp_Q\/VUpttNxNBsI\/AAAAAAAAAbw\/ylbcy0wHBCw\/s1600\/James_S-square.png\" width=\"200\" \/\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Cb\u003EJames Taylor\u003C\/b\u003E is an architect at salesforce.com in the Big Data Group. He founded the Apache Phoenix project and leads its on-going development efforts. Prior to Salesforce, James worked at BEA Systems on projects such as a federated query processing system and a SQL-based complex event programming platform, and has worked in the computer industry for the past 20+ years at various start-ups. He lives with his wife and two daughters in San Francisco.\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"http:\/\/www.hadoopsphere.com\/feeds\/6680837454101243001\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/05\/low-latency-sql-querying-on-hbase.html#comment-form","title":"0 Comments"},{"rel":"edit","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/6680837454101243001"},{"rel":"self","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/6680837454101243001"},{"rel":"alternate","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/05\/low-latency-sql-querying-on-hbase.html","title":"Low latency SQL querying on HBase"}],"author":[{"name":{"$t":"Hadoop Reporter"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/08867930107048620070"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"http:\/\/img2.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-YfhOr2RLjAc\/VUz4KqJNGWI\/AAAAAAAAAcU\/nmBd12k11W4\/s72-c\/nosql-sql-hadoopsphere.png","height":"72","width":"72"},"thr$total":{"$t":"0"}},{"id":{"$t":"tag:blogger.com,1999:blog-2868824907842590784.post-118029770767648080"},"published":{"$t":"2015-03-25T07:05:00.000-07:00"},"updated":{"$t":"2015-03-25T19:30:40.908-07:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"industry"}],"title":{"type":"text","$t":"Governance in a data lake"},"content":{"type":"html","$t":"\u003Cdiv dir=\"ltr\" style=\"text-align: left;\" trbidi=\"on\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-kKDI5-kGWP0\/VRK8cr580-I\/AAAAAAAAAaY\/MvP9_e-3lr0\/s1600\/apache-falcon-governance-hadoopsphere.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/1.bp.blogspot.com\/-kKDI5-kGWP0\/VRK8cr580-I\/AAAAAAAAAaY\/MvP9_e-3lr0\/s1600\/apache-falcon-governance-hadoopsphere.png\" height=\"181\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EThe need for defining a robust data governance layer is becoming an essential requirement for an\u0026nbsp;enterprise data lake. Continuing our discussion on data governance, we focus on Apache Falcon as a solution option for governing the data pipelines. HadoopSphere discussed with\u0026nbsp;Srikanth Sundarrajan, VP of Apache Falcon, about the product as well as the data governance requirements. In the \u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/03\/data-pipelines-with-apache-falcon.html\"\u003Efirst part of the interview, we talked about Falcon's architecture\u003C\/a\u003E. We further discuss the functional aspects in the interaction below.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhat lies ahead on the roadmap of Apache Falcon for 2015?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EMajor focus areas for Apache Falcon in 2015 and beyond:\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EEntity management and Instance administration dashboard\u003C\/b\u003E – Currently CLI based administration is very limiting and the real power of the dependency information available within Falcon can’t be unlocked without an appropriate visual interface. Also entity management complexities can be cut down through a friendlier UI.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003ERecipes\u003C\/b\u003E – Today Falcon supports notion of a process to perform some action over data. But there are standard and routine operations that may be applicable for a wide range of users. Falcon project is currently working on enabling this through the notion of recipe. This will enable users to convert their standard data routines into templates for reuse and more importantly some common templates can be shared across users\/organizations.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003ELife cycle\u003C\/b\u003E – Falcon supports standard data management functions off the shelf, however the same doesn’t cater to every user’s requirement and might require customization. Falcon team is currently working on opening this up and allowing this to be customized per deployment to cater to specific needs of a user.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EOperational simplification\u003C\/b\u003E – When Falcon becomes the de-facto platform (as is the case with some of the users), the richness of dependency information contained can be leveraged to operationally simplify how data processing is managed. Today handling infrastructure outage\/maintenance or degradation or application failures can stall large pipelines causing cascading issues. Dependency information in Falcon can be used to seamlessly recover from these without any manual intervention.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EPipeline designer\u003C\/b\u003E – This is a forward-looking capability in Falcon that enables big data ETL pipelines to be authored visually. This would generate code in language such as Apache Pig and wrap them in appropriate Falcon process and define appropriate feeds.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ECan you elaborate on key desired components of big data governance regardless of tool capabilities at this stage?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ESecurity, Quality, Provenance and Privacy are fundamental when it comes to data governance\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EQuality\u003C\/b\u003E – Quality of data is one of the most critical components and there has to be convenient ways to both audit the system for data quality and also build proactive mechanism to cut out any sources of inaccuracies\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EProvenance \u003C\/b\u003E– Organizations typically have complex data flows and often times it is challenging to figure the lineage of this data. To be able to get this lineage at a dataset level, field level and at a record level (in that order of importance) is very important.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003ESecurity\u003C\/b\u003E – This is fundamental and hygiene to any data system. Authentication, Authorization and Audit trail are non-negotiable. Every user has to be authenticated and all access to data is to be authorized and audited.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E•\u003Cspan class=\"Apple-tab-span\" style=\"white-space: pre;\"\u003E \u003C\/span\u003E\u003Cb\u003EPrivacy \u003C\/b\u003E– Data anonymization is one of the key techniques to conform to laws and regulation of the land. This is something that the data systems have to natively support or enable.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhy would an enterprise not prefer to use commercial tools (like Informatica) and rather use open source Apache Falcon?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EApache Falcon is a Hadoop first data management system and integrates well with standard components in the big data open source eco systems that are widely adopted. This native integration with Hadoop is what makes it a tool of choice. Apache Falcon being available under liberal APL 2.0 license and housed under ASF allows users\/organizations to experiment with it easily and also enable them to contribute their extensions. Recent elevation of Apache Falcon to a top-level project also assures the users about the community driven development process adopted within the Falcon project.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIf someone is using Cloudera distribution, what are the options for him?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EApache Falcon is distribution agnostic and should work (with some minor tweaks) for anyone using Apache Hadoop 2.5.0 and above along with Oozie 4.1.0. \u0026nbsp;There are plenty of users who use Apache Falcon along with HDP. One of the largest users of Apache Falcon has used it along with CDH 3 and CDH 4, and there are some users who have tried using Apache Falcon with MapR distribution as well.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: right;\"\u003E\u003Ci style=\"font-family: Georgia, 'Times New Roman', serif;\"\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/03\/data-pipelines-with-apache-falcon.html\"\u003E\u003Cspan style=\"font-size: large;\"\u003E\u0026lt;\u0026lt;\u0026nbsp;\u003C\/span\u003E\u003C\/a\u003E\u003C\/i\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Ci\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/03\/data-pipelines-with-apache-falcon.html\"\u003E\u003Cspan style=\"font-size: large;\"\u003EData pipelines with Apache Falcon\u003C\/span\u003E\u003C\/a\u003E\u0026nbsp;\u003C\/i\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cbr \/\u003E\u003Ctable bgcolor=\"#CCCCCC\" style=\"text-align: justify;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-zuEiErHSQ8I\/VQ-BY4ofbyI\/AAAAAAAAAZ4\/dVhcT1oa5Wk\/s1600\/Srikanth-Sundarrajan-Mid-Resolution.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-78wI6jYla3Y\/VRBNwjPZlsI\/AAAAAAAAAaI\/XNuApSN11rM\/s1600\/Srikanth-Sundarrajan.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/1.bp.blogspot.com\/-78wI6jYla3Y\/VRBNwjPZlsI\/AAAAAAAAAaI\/XNuApSN11rM\/s1600\/Srikanth-Sundarrajan.png\" height=\"195\" width=\"200\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Cb\u003ESrikanth Sundarrajan\u003C\/b\u003E works at Inmobi Technology Services, helping architect and build their next generation data management system. He is one of the key contributors to Apache Falcon and currently VP of the project. He has been involved in various projects under the Apache Hadoop umbrella including Apache Lens, Apache Hadoop-core, and Apache Oozie. He has been working with distributed processing systems for over a decade and Hadoop in particular over the last 7 years. Srikanth holds a graduate degree in Computer Engineering from University of Southern California.\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Ctable bgcolor=\"#CCCCCC\" style=\"text-align: justify;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd\u003E\u003Ch3\u003E\u003Ca href=\"https:\/\/docs.google.com\/forms\/d\/1mqTF9CWSzmKYTo5uo-yUYpnBuN1-2H10cREgOvQKUeA\/viewform?fbzx=7326483546417528301\" target=\"_blank\"\u003E\u003Cspan style=\"color: #0b5394; font-family: Georgia, Times New Roman, serif; font-size: small;\"\u003E\u003Cb\u003ECall for Papers : HadoopSphere Virtual Conclave\u0026gt;\u0026gt;\u003C\/b\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/h3\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"http:\/\/www.hadoopsphere.com\/feeds\/118029770767648080\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/03\/governance-in-data-lake.html#comment-form","title":"0 Comments"},{"rel":"edit","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/118029770767648080"},{"rel":"self","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/118029770767648080"},{"rel":"alternate","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/03\/governance-in-data-lake.html","title":"Governance in a data lake"}],"author":[{"name":{"$t":"Hadoop Reporter"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/08867930107048620070"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"http:\/\/img2.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/1.bp.blogspot.com\/-kKDI5-kGWP0\/VRK8cr580-I\/AAAAAAAAAaY\/MvP9_e-3lr0\/s72-c\/apache-falcon-governance-hadoopsphere.png","height":"72","width":"72"},"thr$total":{"$t":"0"}},{"id":{"$t":"tag:blogger.com,1999:blog-2868824907842590784.post-9150907784520021810"},"published":{"$t":"2015-03-10T13:00:00.001-07:00"},"updated":{"$t":"2015-03-13T01:43:26.487-07:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"industry"},{"scheme":"http://www.blogger.com/atom/ns#","term":"influencers"}],"title":{"type":"text","$t":"Top Big Data influencers of 2014"},"content":{"type":"html","$t":"\u003Cdiv dir=\"ltr\" style=\"text-align: left;\" trbidi=\"on\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-l3fbWZcr2qg\/VP9KRghK_mI\/AAAAAAAAAZI\/pjkSjDEgzvs\/s1600\/hadoopsphere-big-data-influencers-2014.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/4.bp.blogspot.com\/-l3fbWZcr2qg\/VP9KRghK_mI\/AAAAAAAAAZI\/pjkSjDEgzvs\/s1600\/hadoopsphere-big-data-influencers-2014.png\" height=\"320\" width=\"295\" \/\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EBig Data is an exciting technology space innovating at a pace probably never seen before. With a dynamic ecosystem and scorching pace of product development, it is easy to be left behind. However, thanks to visionaries in this ecosystem who have been able to decode the maze and set things right for us, we have been seeing successful big data use cases and implementations.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EHadoopSphere presents below its annual list of top big data influencers. This list reflects the people, products, organizations and portals that exercised the most influence on big data and ecosystem in a particular year. The influencers have been listed in the following categories:\u003C\/span\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cul style=\"text-align: left;\"\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAnalysts\u003C\/span\u003E\u003C\/li\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EOnline Media\u003C\/span\u003E\u003C\/li\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EProducts\u003C\/span\u003E\u003C\/li\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ESocial Media\u003C\/span\u003E\u003C\/li\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAngels\u003C\/span\u003E\u003C\/li\u003E\u003Cli style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EThought Leaders\u003C\/span\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Ca href=\"https:\/\/drive.google.com\/file\/d\/0BzQMQQg8gvgKNklBSS1wQjF3QUU\/view?usp=sharing\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EClick here to read the methodology used.\u003C\/span\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EThe info-graphic below shows the Top Big Data influencers of 2014 as ranked by HadoopSphere.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-9q5HPwMCY_g\/VP9KzcuNBdI\/AAAAAAAAAZQ\/sCSBTiKhsuo\/s1600\/hadoopsphere-influencers-list-2014.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/2.bp.blogspot.com\/-9q5HPwMCY_g\/VP9KzcuNBdI\/AAAAAAAAAZQ\/sCSBTiKhsuo\/s1600\/hadoopsphere-influencers-list-2014.png\" height=\"640\" width=\"480\" \/\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E \u003C\/span\u003E\u003Cbr \/\u003E\u003Ctable border=\"1\" cellpadding=\"0\" cellspacing=\"0\"\u003E \u003Ctbody\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAnalysts:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/www.forrester.com\/Mike-Gualtieri\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EMike Gualtieri\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIn 2013, Mike Gualtieri of Forrester had predicted that big data will be the \u003Ca href=\"http:\/\/www.hadoopsphere.com\/2013\/01\/big-data-forecast-digest-for-2013.html\"\u003ETime person of the year\u003C\/a\u003E. Well, it almost came true with a big data use (or misuse) case (Edward Snowden) making it to the runner up of Time person of the year. Besides occasionally playing sorcerer, Mike has remained one of the most well respected analyst in year 2014 commanding a comprehensive vison and view for the data ecosystem.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.monash.com\/curtbio.html\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ECurt Monash\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIf you have not been reading Curt Monash, you may have been living on an island probably. And, if you got a few incisive comments on your product, well, then you are probably part of an urban elite in this big data city. Don’t expect courtesies, just expect plain honest assessment and that too with technical depth from Curt.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.ovum.com\/authors\/tony-baer\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ETony Baer\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EConsistency and clarity are the forte of Tony Baer, Ovum’s principal analyst. Presume a consistent sane advice with clear cut guidance on what to expect and what not to expect from Tony. He has remained a top influencer in big data and Hadoop ecosystem consistently for another year.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EOnline Media:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.tdwi.org\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ETDWI\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWith research papers, blogs, webinars and education events, TDWI continued to attract eye-balls and sponsors alike making it one of the top focused industry portals.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.ibmdatamag.com\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIBM\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIBM is a technology company but runs a media machinery of its own. Its data initiatives like IBM Data Magazine, IBM Big Data Hub, Big Data University, Developer Works, Red books combined together continued to be among top traffic getters. Though the content may be in part IBM specific, overall it did a great work of educating big data community.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.dzone.com\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EDZone\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWith ‘smart content’ for big data professionals, DZone continued to encourage community to contribute links, articles, guides and ‘refcardz’. DZone ensured both quality and good volume traffic resulting in a high influence on techies.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EProducts:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/spark.apache.org\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ESpark\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EDo we need to say anything about this obvious choice? Apache Spark has been the flavor of all seasons since 2014 beginning. With biggest open source community in big data ecosystem, it continued to define and influence the shape of future products.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/scala-lang.org\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EScala\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAlthough technically a programming language and not a product, Scala is listed here as it marches its way ahead to become a preferred language for big data programming. With both Apache Spark and Flink promoting it big time, the simplicity and power of the language became more obvious. We expect Scala to become one of the most powerful languages in few years.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/kafka.apache.org\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EKafka\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIf you need to quote an example of word-of-mouth success, here it is. Apache Kafka was developed at LinkedIn and was not a part of major Hadoop distributions till early 2015. However, still it has emerged as a preferred choice for data ingestion and has seen adoption by internet companies, financial majors and travel portals among others.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ESocial Media:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/twitter.com\/bigdata\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EBen Lorica\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIf one has a dream twitter handle like ‘bigdata’, it may not be sheer co-incidence. It probably shows the handle owner has been talking about big data before we heard of it. Ben Lorica is the Chief Data Scientist and Director of Content Strategy for Data at O'Reilly Media, Inc and commands the ‘bigdata’ twitter handle with its impressive following.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/twitter.com\/kdnuggets\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EGregory Piatetsky-Shapiro\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAs the President of KDnuggets, which provides analytics and data mining consulting, Gregory is a founder of KDD (Knowledge Discovery and Data mining conferences). He is one of the leading social influencers with his mentions generating huge follower interest.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/twitter.com\/kirkdborne\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EKirk Borne\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EDr.Kirk Borne is Professor of Astrophysics and Computational Science at George Mason University. As a data scientist and astrophysicist, he mostly talks about big data on social media and continues to attract huge follower base.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAngel Investors:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/twitter.com\/naval\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003ENaval Ravikant\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EEntrepreneur and an angel investor, Naval Ravikant is co-founder of AngelList. Through this terrific forum and other offline activities, he has been drumming up the cause of many startups and taking them through the funding gates.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.dcvc.com\/\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EData Collective\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EDCVC (aka Data Collective) is a seed and early stage venture capital fund that invests in big data companies. Its extended team consists of more than 35 “Equity Partners,” who are notable technical founders and executives, data scientists and engineers. Some of the notable portfolio companies include Blue Data, Continuity, Elasticsearch, Citus Data.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd colspan=\"2\" style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ch3\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EThought Leaders:\u003C\/span\u003E\u003C\/h3\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"https:\/\/twitter.com\/mikeolson\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EMike Olson\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EAs the Chief Strategy Officer of Cloudera, Mike Olson is a leader whose vision has been driving his company and much of the Hadoop ecosystem. His unbridled passion combined with ability to foresee market dynamics makes him one of the biggest thought leaders and influencers in entire information technology arena. From marketing Hadoop to touting Impala as MPP or mentoring competitive Spark, Mike has exhibited unparalleled transformational leadership characteristics.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003Ctr\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 156px;\"\u003E\u003Ca href=\"http:\/\/www.gartner.com\/analyst\/38961\/Merv-Adrian\" rel=\"nofollow\" target=\"_blank\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EMerv Adrian\u003C\/span\u003E\u003C\/a\u003E\u003C\/td\u003E   \u003Ctd style=\"border-color: rgb(204, 204, 204); text-align: left; vertical-align: top; width: 468px;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EMerv Adrian is Research VP at Gartner and the more known face of the research company in social media and event circles. Each year Gartner somehow manages to hit a rough note with the big data vendors, be it “trough of disillusionment” or “data lake fallacy” comment. However, Merv with his astute knowledge of Hadoop ecosystem, BI world and technology lifecycles has made people understand the discordant notes to apply caution, restrain and intelligence beyond the obvious hype. And, that’s what thought leaders do – create sense and path out of chaos and conflicts. Pro Tip: Merv may not agree with you but will still have you and him understand a common path.\u003C\/span\u003E\u003C\/td\u003E  \u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2013\/12\/top-big-data-influencers-of-2013.html\"\u003E\u0026lt;\u0026lt; Top big data influencers of 2013\u003C\/a\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #999999; font-size: large;\"\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2014\/05\/big-data-and-hadoop-training.html\" style=\"background-color: cyan;\"\u003EGet trained in Hadoop, Spark and Big Data technologies - Enroll now\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"http:\/\/www.hadoopsphere.com\/feeds\/9150907784520021810\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/03\/top-big-data-influencers-of-2014.html#comment-form","title":"1 Comments"},{"rel":"edit","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/9150907784520021810"},{"rel":"self","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/9150907784520021810"},{"rel":"alternate","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/03\/top-big-data-influencers-of-2014.html","title":"Top Big Data influencers of 2014"}],"author":[{"name":{"$t":"Hadoop Reporter"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/08867930107048620070"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"http:\/\/img2.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/4.bp.blogspot.com\/-l3fbWZcr2qg\/VP9KRghK_mI\/AAAAAAAAAZI\/pjkSjDEgzvs\/s72-c\/hadoopsphere-big-data-influencers-2014.png","height":"72","width":"72"},"thr$total":{"$t":"1"}},{"id":{"$t":"tag:blogger.com,1999:blog-2868824907842590784.post-6387060021573421757"},"published":{"$t":"2015-02-19T10:44:00.001-08:00"},"updated":{"$t":"2015-02-19T10:44:16.187-08:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"industry"}],"title":{"type":"text","$t":"Ciao latency, hallo speed"},"content":{"type":"html","$t":"\u003Cdiv dir=\"ltr\" style=\"text-align: left;\" trbidi=\"on\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-Uz4JwWJnzqY\/VOYt4HaWk9I\/AAAAAAAAAYY\/j9WNnCKrbGA\/s1600\/flink-speed-hadoopsphere.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/3.bp.blogspot.com\/-Uz4JwWJnzqY\/VOYt4HaWk9I\/AAAAAAAAAYY\/j9WNnCKrbGA\/s1600\/flink-speed-hadoopsphere.png\" height=\"265\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhile Hadoop had long been suspect to higher latency engines due to inherent MapReduce design, the same does not hold true any longer. With advent of faster processing engines for Hadoop, distributed data can now be processed with lower latency and in more efficient manner. We continue our discussion with Stephan Ewen to find out more about Apache Flink for distributed data processing. In the \u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/02\/distributed-data-processing-with-apache.html\"\u003Efirst part of the discussion\u003C\/a\u003E, we focused on technical aspects of Apache Flink's working. Now we turn our attention to the comparative use and fitment in the overall Hadoop ecosystem. Read below to find out what Ewen has to say.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EHow does Apache Flink technically compare to Spark and are there any performance benefits?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink and Spark start from different points in the system design space. In the end, it is really a question of finding the right tool for a particular workload. Flink’s runtime has some unique features that are beneficial in certain workloads.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink uses data streaming rather than batch processing as much as possible to execute both batch and streaming programs. This means for streaming programs that they are executed in a real streaming fashion, with more flexible windows, lower latency, and long living operators. For batch programs, intermediate data sets are often piped to their consumers as they are created, saving on memory and disk I\/O for data sets larger than memory.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink is memory-aware, operating on binary data rather than Java objects. This makes heavy data crunching inside the JVM efficient, and alleviates many of the problems that the JVM has for data-intensive workloads.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink optimizes programs in a pre-flight stage using a cost-based optimizer rather than eagerly sending programs to the cluster. This may have advantages in performance, and helps the debuggability of the programs.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink has dedicated iteration operators in the APIs and in the runtime. These operators support fast iterations and allow the system to be very efficient, for example, \u003Ca href=\"http:\/\/data-artisans.com\/data-analysis-with-flink.html\" rel=\"nofollow\" target=\"_blank\"\u003Ein case of graphs\u003C\/a\u003E.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EWhat lies ahead on the roadmap for Apache Flink in 2015?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EThe Flink community has recently discussed in the developer mailing list and published a roadmap for 2015. The roadmap includes more libraries and applications on top of Flink (e.g., a graph and a Machine Learning library), support for interactive programs, improvements to streaming functionality, performance, and robustness, as well as integration with other Apache and open source projects. (See here for \u003Ca href=\"https:\/\/cwiki.apache.org\/confluence\/display\/FLINK\/Flink+Roadmap\" rel=\"nofollow\" target=\"_blank\"\u003Emore details on the roadmap\u003C\/a\u003E)\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ch2 style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EIn which use cases do you see Apache Flink being a good fit vis-à-vis other ecosystem options?\u003C\/span\u003E\u003C\/h2\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFlink’s batch programs shine when using data-intensive and compute intensive pipelines - and even more so when including iterative parts. This includes both complex ETL jobs, as well as data intensive machine learning algorithms. Flink’s architecture is designed to combine robustness with the ease of use and performance benefits of modern APIs and in-memory processing. A good example can be \u003Ca href=\"http:\/\/data-artisans.com\/computing-recommendations-with-flink.html\" rel=\"nofollow\" target=\"_blank\"\u003Erecommendation systems\u003C\/a\u003E for objects like new movies on Netflix, or shopping articles on Amazon.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EFor data streaming use cases, the newly streaming API (beta status) offers beautiful high-level APIs with flexible windowing semantics, backed by a low-latency execution engine.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003EGraph algorithms work particularly well on Flink, due to its strong support for (stateful) iterative algorithms. As one of the first major libraries, Flink’s graph library “Gelly” has been added in its first version.\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: right;\"\u003E\u003Ci\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Ca href=\"http:\/\/www.hadoopsphere.com\/2015\/02\/distributed-data-processing-with-apache.html\"\u003E\u0026lt;\u0026lt;Distributed data processing with Apache Flink\u003C\/a\u003E\u003C\/span\u003E\u003C\/i\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Ctable bgcolor=\"#CCCCCC\" style=\"text-align: justify;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-BHrqU3n-f8c\/VOOTaVp54XI\/AAAAAAAAAXw\/u_Z1BC4XlpA\/s1600\/stephan_ewen.jpg\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/4.bp.blogspot.com\/-BHrqU3n-f8c\/VOOTaVp54XI\/AAAAAAAAAXw\/u_Z1BC4XlpA\/s1600\/stephan_ewen.jpg\" height=\"200\" width=\"200\" \/\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif;\"\u003E\u003Cb\u003EStephan Ewen\u003C\/b\u003E is committer and Vice President of Apache Flink and co-founder and CTO of \u003Ca href=\"http:\/\/data-artisans.com\/\" rel=\"nofollow\" target=\"_blank\"\u003Edata Artisans\u003C\/a\u003E, a Berlin-based company that is developing and contributing to Apache Flink. Before founding data Artisans, Stephan was leading the development of Flink since the early days of the project (then called Stratosphere). Stephan holds a PhD in Computer Science from the University of Technology, Berlin, and has been with IBM Research and Microsoft Research in the course of several internships.\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan style=\"font-family: Georgia, Times New Roman, serif; font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: justify;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"http:\/\/www.hadoopsphere.com\/feeds\/6387060021573421757\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/02\/ciao-latency-hallo-speed.html#comment-form","title":"0 Comments"},{"rel":"edit","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/6387060021573421757"},{"rel":"self","type":"application/atom+xml","href":"http:\/\/www.blogger.com\/feeds\/2868824907842590784\/posts\/default\/6387060021573421757"},{"rel":"alternate","type":"text/html","href":"http:\/\/www.hadoopsphere.com\/2015\/02\/ciao-latency-hallo-speed.html","title":"Ciao latency, hallo speed"}],"author":[{"name":{"$t":"Hadoop Reporter"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/08867930107048620070"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"http:\/\/img2.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-Uz4JwWJnzqY\/VOYt4HaWk9I\/AAAAAAAAAYY\/j9WNnCKrbGA\/s72-c\/flink-speed-hadoopsphere.png","height":"72","width":"72"},"thr$total":{"$t":"0"}}]}});