2012 |
Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Guo, Xiaonan; Orsi, Giorgio; Schallhart, Christian OPAL: automated form understanding for the deep web (Inproceeding) International Conference on World Wide Web (WWW'12), Page(s): 829-838, ACM, 2012. @inproceedings{furche12:_forms_form_patter, title = {OPAL: automated form understanding for the deep web}, author = {Tim Furche and Georg Gottlob and Giovanni Grasso and Xiaonan Guo and Giorgio Orsi and Christian Schallhart}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/www-2012-OPAL-Forms-form-Patterns-Reusable-Form-Understanding.pdf}, year = {2012}, date = {2012-04-20}, booktitle = {International Conference on World Wide Web (WWW'12)}, pages = {829--838}, publisher = {ACM}, abstract = {Forms are our gates to the web. They enable us to access the deep content of web sites. Automatic form understanding unlocks this content for applications ranging from crawlers to meta-search engines and is essential for improving usability and accessibility of the web. Form understanding has received surprisingly little attention other than as component in specific applications such as crawlers. No comprehensive approach to form understanding exists and previous works disagree even in the definition of the problem. In this paper, we present OPAL, the first comprehensive approach to form understanding. We identify form labeling and form interpretation as the two main tasks involved in form understanding. On both problems OPAL pushes the state of the art: For form labeling, it combines signals from the text, structure, and visual rendering of a web page, yielding robust characterisations of common design patterns. In extensive experiments on the ICQ and TEL-8 benchmarks and a set of 200 modern web forms OPAL outperforms previous approaches by a significant margin. For form interpretation, we introduce a template language to describe frequent form patterns. 
These two parts of OPAL combined yield form understanding with near perfect accuracy (> 98%).}, note = {accepted for publication}, } Forms are our gates to the web. They enable us to access the deep content of web sites. Automatic form understanding unlocks this content for applications ranging from crawlers to meta-search engines and is essential for improving usability and accessibility of the web. Form understanding has received surprisingly little attention other than as component in specific applications such as crawlers. No comprehensive approach to form understanding exists and previous works disagree even in the definition of the problem. In this paper, we present OPAL, the first comprehensive approach to form understanding. We identify form labeling and form interpretation as the two main tasks involved in form understanding. On both problems OPAL pushes the state of the art: For form labeling, it combines signals from the text, structure, and visual rendering of a web page, yielding robust characterisations of common design patterns. In extensive experiments on the ICQ and TEL-8 benchmarks and a set of 200 modern web forms OPAL outperforms previous approaches by a significant margin. For form interpretation, we introduce a template language to describe frequent form patterns. These two parts of OPAL combined yield form understanding with near perfect accuracy (> 98%). |
Kranzdorf, Jochen; Sellers, Andrew; Grasso, Giovanni; Schallhart, Christian; Furche, Tim Visual OXPath: Robust Wrapping by Example (Inproceeding) International Conference on World Wide Web (WWW'12), Page(s): 369-373, 2012. @inproceedings{kranzdorf12:_visual_oxpat, title = {Visual OXPath: Robust Wrapping by Example}, author = {Jochen Kranzdorf and Andrew Sellers and Giovanni Grasso and Christian Schallhart and Tim Furche}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/www-2012-Visual-OXPath-Robust-Wrapping-by-Example.pdf}, year = {2012}, date = {2012-04-19}, booktitle = {International Conference on World Wide Web (WWW'12)}, volume = {Companion}, pages = {369--373}, abstract = {Good examples are hard to find, particularly in wrapper induction: Picking even one wrong example can spell disaster by yielding overgeneralized or overspecialized wrappers. Such wrappers extract data with low precision or recall, unless adjusted by human experts at significant cost. Visual OXPath is an open-source, visual wrapper induction system that requires minimal examples and eases wrapper refinement: Often it derives the intended wrapper from a single example through sophisticated heuristics that determine the best set of similar examples. To ease wrapper refinement, it offers a list of wrappers ranked by example similarity and robustness. Visual OXPath offers extensive visual feedback for this refinement which can be performed without any knowledge of the underlying wrapper language. Where further refinement by a human wrapper is needed, Visual OXPath profits from being based on OXPath, a declarative wrapper language that extends XPath with a thin layer of features necessary for extraction and page navigation.}, } Good examples are hard to find, particularly in wrapper induction: Picking even one wrong example can spell disaster by yielding overgeneralized or overspecialized wrappers. 
Such wrappers extract data with low precision or recall, unless adjusted by human experts at significant cost. Visual OXPath is an open-source, visual wrapper induction system that requires minimal examples and eases wrapper refinement: Often it derives the intended wrapper from a single example through sophisticated heuristics that determine the best set of similar examples. To ease wrapper refinement, it offers a list of wrappers ranked by example similarity and robustness. Visual OXPath offers extensive visual feedback for this refinement which can be performed without any knowledge of the underlying wrapper language. Where further refinement by a human wrapper is needed, Visual OXPath profits from being based on OXPath, a declarative wrapper language that extends XPath with a thin layer of features necessary for extraction and page navigation. |
Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Gunes, Ömer; Guo, Xiaonan; Kravchenko, Andrey; Orsi, Giorgio; Schallhart, Christian; Sellers, Andrew; Wang, Cheng DIADEM: Domain-centric, Intelligent, Automated Data Extraction Methodology (Inproceeding) International Conference on World Wide Web (WWW'12), Page(s): 267-270, ACM, 2012. @inproceedings{furche12:_diadem, title = {DIADEM: Domain-centric, Intelligent, Automated Data Extraction Methodology}, author = {Tim Furche and Georg Gottlob and Giovanni Grasso and Ömer Gunes and Xiaonan Guo and Andrey Kravchenko and Giorgio Orsi and Christian Schallhart and Andrew Sellers and Cheng Wang}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/www-2012-DIADEM-Domain-centric-Intelligent-Automated-Data-Extraction-Methodology.pdf}, year = {2012}, date = {2012-04-19}, booktitle = {International Conference on World Wide Web (WWW'12)}, volume = {Companion}, pages = {267--270}, publisher = {ACM}, abstract = {Search engines are the sinews of the web. These sinews have become strained, however: Where the web’s function once was a mix of library and yellow pages, it has become the central marketplace for information of almost any kind. We search more and more for objects with specific characteristics, a car with a certain mileage, an affordable apartment close to a good school, or the latest accessory for our phones. Search engines all too often fail to provide reasonable answers, making us sift through dozens of websites with thousands of offers—never to be sure a better offer isn’t just around the corner. What search engines are missing is understanding of the objects and their attributes published on websites. Automatically identifying and extracting these objects is akin to alchemy: transforming unstructured web information into highly structured data with near perfect accuracy. With DIADEM we present a formula for this transformation, but at a price: DIADEM identifies and extracts data from a website with high accuracy. 
The price is that for this task we need to provide DIADEM with extensive knowledge about the ontology and phenomenology of the domain, i.e., about entities (and relations) and about the representation of these entities in the textual, structural, and visual language of a website of this domain. In this demonstration, we demonstrate with a first prototype of DIADEM that, in contrast to alchemists, DIADEM has developed a viable formula.}, } Search engines are the sinews of the web. These sinews have become strained, however: Where the web’s function once was a mix of library and yellow pages, it has become the central marketplace for information of almost any kind. We search more and more for objects with specific characteristics, a car with a certain mileage, an affordable apartment close to a good school, or the latest accessory for our phones. Search engines all too often fail to provide reasonable answers, making us sift through dozens of websites with thousands of offers—never to be sure a better offer isn’t just around the corner. What search engines are missing is understanding of the objects and their attributes published on websites. Automatically identifying and extracting these objects is akin to alchemy: transforming unstructured web information into highly structured data with near perfect accuracy. With DIADEM we present a formula for this transformation, but at a price: DIADEM identifies and extracts data from a website with high accuracy. The price is that for this task we need to provide DIADEM with extensive knowledge about the ontology and phenomenology of the domain, i.e., about entities (and relations) and about the representation of these entities in the textual, structural, and visual language of a website of this domain. In this demonstration, we demonstrate with a first prototype of DIADEM that, in contrast to alchemists, DIADEM has developed a viable formula. |
Furche, Tim; Grasso, Giovanni; Orsi, Giorgio; Schallhart, Christian; Wang, Cheng Automatically Learning Gazetteers from the Deep Web (Inproceeding) International Conference on World Wide Web (WWW'12), Page(s): 191-195, 2012. @inproceedings{furche12:_autom_learn_gazet_from_deep_web, title = {Automatically Learning Gazetteers from the Deep Web}, author = {Tim Furche and Giovanni Grasso and Giorgio Orsi and Christian Schallhart and Cheng Wang}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/www-2012-AMBER-Automatically-Learning-Gazetteers-from-the-Deep-Web.pdf}, year = {2012}, date = {2012-04-19}, booktitle = {International Conference on World Wide Web (WWW'12)}, volume = {Companion}, pages = {191--195}, abstract = {Web extraction is the task of turning unstructured HTML into knowledge. Computers are able to generate annotations of unstructured HTML, but it is more important to turn those annotations into structured knowledge. Unfortunately, the current systems extracting knowledge from result pages lack accuracy. In this proposal, we present AMBER, a system fully automated turning annotations to structured knowledge from any result page of a given domain. AMBER observes basic domain attributes on a page and leverages repeated occurrences of similar attributes to group related attributes into records. This contrasts to previous approaches that analyze the repeated structure only of the HTML, as no domain knowledge is available. Our multi-domain experimental evaluation on hundreds of sites demonstrates that AMBER achieves accuracy (>98\%) comparable to skilled human annotator.}, } Web extraction is the task of turning unstructured HTML into knowledge. Computers are able to generate annotations of unstructured HTML, but it is more important to turn those annotations into structured knowledge. Unfortunately, the current systems extracting knowledge from result pages lack accuracy. 
In this proposal, we present AMBER, a system fully automated turning annotations to structured knowledge from any result page of a given domain. AMBER observes basic domain attributes on a page and leverages repeated occurrences of similar attributes to group related attributes into records. This contrasts to previous approaches that analyze the repeated structure only of the HTML, as no domain knowledge is available. Our multi-domain experimental evaluation on hundreds of sites demonstrates that AMBER achieves accuracy (>98%) comparable to skilled human annotator. |
Guo, Xiaonan; Kranzdorf, Jochen; Furche, Tim; Grasso, Giovanni; Orsi, Giorgio; Schallhart, Christian OPAL: A Passe-partout for Web Forms (Inproceeding) International Conference on World Wide Web (WWW'12), Page(s): 353-357, ACM, 2012. @inproceedings{guo12:_opal, title = {OPAL: A Passe-partout for Web Forms}, author = {Xiaonan Guo and Jochen Kranzdorf and Tim Furche and Giovanni Grasso and Giorgio Orsi and Christian Schallhart}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/www-2012-OPAL-A-Passe-partout-for-Web-Forms.pdf}, year = {2012}, date = {2012-04-19}, booktitle = {International Conference on World Wide Web (WWW'12)}, volume = {Companion}, pages = {353--357}, publisher = {ACM}, abstract = {Web forms are the interfaces of the deep web. Though modern web browsers provide facilities to assist in form filling, this assistance is limited to prior form fillings or keyword matching. Automatic form understanding enables a broad range of applications, including crawlers, meta-search engines, and usability and accessibility support for enhanced web browsing. In this demonstration, we use a novel form understanding approach, OPAL, to assist in form filling even for complex, previously unknown forms. OPAL associates form labels to fields by analyzing structural properties in the HTML encoding and visual features of the page rendering. OPAL interprets this labeling and classifies the fields according to a given domain ontology. The combination of these two properties, allows OPAL to deal effectively with many forms outside of the grasp of existing form filling techniques. In the UK real estate domain, OPAL achieves > 99\% accuracy in form understanding.}, } Web forms are the interfaces of the deep web. Though modern web browsers provide facilities to assist in form filling, this assistance is limited to prior form fillings or keyword matching. 
Automatic form understanding enables a broad range of applications, including crawlers, meta-search engines, and usability and accessibility support for enhanced web browsing. In this demonstration, we use a novel form understanding approach, OPAL, to assist in form filling even for complex, previously unknown forms. OPAL associates form labels to fields by analyzing structural properties in the HTML encoding and visual features of the page rendering. OPAL interprets this labeling and classifies the fields according to a given domain ontology. The combination of these two properties, allows OPAL to deal effectively with many forms outside of the grasp of existing form filling techniques. In the UK real estate domain, OPAL achieves > 99% accuracy in form understanding. |
Febbraro, Onofrio; Grasso, Giovanni; Leone, Nicola; Ricca, Francesco JASP: a framework for integrating Answer Set Programming with Java (Inproceeding) Principles of Knowledge Representation and Reasoning (KR2012), 2012. @inproceedings{jdlv, title = {JASP: a framework for integrating Answer Set Programming with Java}, author = {Onofrio Febbraro and Giovanni Grasso and Nicola Leone and Francesco Ricca}, year = {2012}, date = {2012-04-18}, booktitle = {Principles of Knowledge Representation and Reasoning (KR2012)}, abstract = {Answer Set Programming (ASP) is a fully-declarative logic programming paradigm, which has been proposed in the area of knowledge representation and non-monotonic reasoning. Nowadays, the formal properties of ASP are well-understood, efficient ASP systems are available, and, recently, ASP has been employed in a few industrial applications. However, ASP technology is not mature for a successful exploitation in industry yet. One of the main obstacles in the development of ASP-based industrial applications is the weak integration of ASP technologies (i.e., ASP programs and solvers) in the well-assessed software development processes and platforms which are tailored for imperative/object-oriented programming languages. In this paper we present a new programming framework blending ASP with the Java programming language. The framework is based on an hybrid language that transparently supports a bilateral interaction between ASP and Java, called JASP. The "impedance mismatch" in knowledge representation between the two programming paradigms is automatically handled, and the object-relational mapping (ORM) can be customized by the programmer. JASP specifications are compliant with the JPA standard for ORM, to perfectly fit extensively-adopted enterprise application development standards. 
The framework also encompasses an implementation of JASP as a plug-in for the Eclipse platform, called JDLV, which includes a compiler from JASP to Java. JDLV provides a seamless integration of ASP-based technologies within one of the most diffused development environment for Java. Moreover, we show a real-world application developed with JASP and JDLV, which highlights the effectiveness of our ASP--Java integration framework.}, note = {accepted for publication}, } Answer Set Programming (ASP) is a fully-declarative logic programming paradigm, which has been proposed in the area of knowledge representation and non-monotonic reasoning. Nowadays, the formal properties of ASP are well-understood, efficient ASP systems are available, and, recently, ASP has been employed in a few industrial applications. However, ASP technology is not mature for a successful exploitation in industry yet. One of the main obstacles in the development of ASP-based industrial applications is the weak integration of ASP technologies (i.e., ASP programs and solvers) in the well-assessed software development processes and platforms which are tailored for imperative/object-oriented programming languages. In this paper we present a new programming framework blending ASP with the Java programming language. The framework is based on an hybrid language that transparently supports a bilateral interaction between ASP and Java, called JASP. The "impedance mismatch" in knowledge representation between the two programming paradigms is automatically handled, and the object-relational mapping (ORM) can be customized by the programmer. JASP specifications are compliant with the JPA standard for ORM, to perfectly fit extensively-adopted enterprise application development standards. The framework also encompasses an implementation of JASP as a plug-in for the Eclipse platform, called JDLV, which includes a compiler from JASP to Java. 
JDLV provides a seamless integration of ASP-based technologies within one of the most diffused development environment for Java. Moreover, we show a real-world application developed with JASP and JDLV, which highlights the effectiveness of our ASP--Java integration framework. |
Furche, Tim; Grasso, Giovanni; Kravchenko, Andrey; Schallhart, Christian Turn the page: Automated traversal of paginated websites (Inproceeding) International Conference on Web Engineering (ICWE'12), 2012. @inproceedings{furche12:beryl-icwe, title = {Turn the page: Automated traversal of paginated websites}, author = {Tim Furche and Giovanni Grasso and Andrey Kravchenko and Christian Schallhart}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/04/icwe-2012-turn-the-page-automated-traversal-of-paginated-websites.pdf}, year = {2012}, date = {2012-04-17}, booktitle = {International Conference on Web Engineering (ICWE'12)}, abstract = {Content-intensive web sites, such as Google or Amazon, paginate their results to accommodate limited screen sizes. Thus, human users and automatic tools alike have to traverse the pagination links when they crawl the site, extract data, or automate common tasks, where these applications require access to the entire result set. Previous approaches, as well as existing crawlers and automation tools, rely on simple heuristics (e.g., considering only the link text), falling back to an exhaustive exploration of the site where those heuristics fail. In particular, focused crawlers and data extraction systems target only fractions of the individual pages of a given site, rendering a highly accurate identification of pagination links essential to avoid the exhaustive exploration of irrelevant pages. We identify pagination links in a wide range of domains and sites with near perfect accuracy (99\%). We obtain these results with a novel framework for web block classification, BERyL, that combines rule-based reasoning for feature extraction and machine learning for feature selection and classification. Through this combination, BERyL is applicable in a wide settings range, adjusted to maximise either precision, recall, or speed. 
We illustrate how BERyL minimises the effort for feature extraction and evaluate the impact of a broad range of features (content, structural, and visual).}, } Content-intensive web sites, such as Google or Amazon, paginate their results to accommodate limited screen sizes. Thus, human users and automatic tools alike have to traverse the pagination links when they crawl the site, extract data, or automate common tasks, where these applications require access to the entire result set. Previous approaches, as well as existing crawlers and automation tools, rely on simple heuristics (e.g., considering only the link text), falling back to an exhaustive exploration of the site where those heuristics fail. In particular, focused crawlers and data extraction systems target only fractions of the individual pages of a given site, rendering a highly accurate identification of pagination links essential to avoid the exhaustive exploration of irrelevant pages. We identify pagination links in a wide range of domains and sites with near perfect accuracy (99%). We obtain these results with a novel framework for web block classification, BERyL, that combines rule-based reasoning for feature extraction and machine learning for feature selection and classification. Through this combination, BERyL is applicable in a wide settings range, adjusted to maximise either precision, recall, or speed. We illustrate how BERyL minimises the effort for feature extraction and evaluate the impact of a broad range of features (content, structural, and visual). |
2011 |
Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Schallhart, Christian; Sellers, Andrew OXPath: A Language for Scalable, Memory-efficient Data Extraction from Web Applications (Article) Proceedings of the VLDB Endowment/International Conference on Very Large Databases (VLDB'11), 4, 11, Page(s): 1016--1027, 2011. @article{sellers11:_oxpat1, title = {OXPath: A Language for Scalable, Memory-efficient Data Extraction from Web Applications}, author = {Tim Furche and Georg Gottlob and Giovanni Grasso and Christian Schallhart and Andrew Sellers}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/vldb-2011-OXPath-A-Language-for-Scalable-Memory-efficient-Data-Extraction-from-Web-Applications.pdf}, year = {2011}, date = {2011-01-02}, journal = {Proceedings of the VLDB Endowment/International Conference on Very Large Databases (VLDB'11)}, volume = {4}, number = {11}, pages = {1016--1027}, abstract = {The evolution of the web has outpaced itself: The growing wealth of information and the increasing sophistication of interfaces necessitate automated processing. Web automation and extraction technologies have been overwhelmed by this very growth. To address this trend, we identify four key requirements of web extraction: (1) Interact with sophisticated web application interfaces, (2) Precisely capture the relevant data for most web extraction tasks, (3) Scale with the number of visited pages, and (4) Readily embed into existing web technologies. We introduce OXPath, an extension of XPath for interacting with web applications and for extracting information thus revealed. It addresses all the above requirements. OXPath’s page-at-a-time evaluation guarantees memory use independent of the number of visited pages, yet remains polynomial in time. We validate experimentally the theoretical complexity and demonstrate that its evaluation is dominated by the page rendering of the underlying browser. 
Our experiments show that OXPath outperforms existing commercial and academic data extraction tools by a wide margin. OXPath is available under an open source license.}, } The evolution of the web has outpaced itself: The growing wealth of information and the increasing sophistication of interfaces necessitate automated processing. Web automation and extraction technologies have been overwhelmed by this very growth. To address this trend, we identify four key requirements of web extraction: (1) Interact with sophisticated web application interfaces, (2) Precisely capture the relevant data for most web extraction tasks, (3) Scale with the number of visited pages, and (4) Readily embed into existing web technologies. We introduce OXPath, an extension of XPath for interacting with web applications and for extracting information thus revealed. It addresses all the above requirements. OXPath’s page-at-a-time evaluation guarantees memory use independent of the number of visited pages, yet remains polynomial in time. We validate experimentally the theoretical complexity and demonstrate that its evaluation is dominated by the page rendering of the underlying browser. Our experiments show that OXPath outperforms existing commercial and academic data extraction tools by a wide margin. OXPath is available under an open source license. |
Sellers, Andrew; Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Schallhart, Christian Taking the OXPath down the Deep Web (Inproceeding) International Conference on Extending Database Technology (EDBT'11), Page(s): 542--545, 2011. @inproceedings{sellers11:_takin_oxpat_down_deep_web, title = {Taking the OXPath down the Deep Web}, author = {Andrew Sellers and Tim Furche and Georg Gottlob and Giovanni Grasso and Christian Schallhart}, url = {http://doi.acm.org/10.1145/1951365.1951436}, year = {2011}, date = {2011-01-01}, booktitle = {International Conference on Extending Database Technology (EDBT'11)}, pages = {542--545}, abstract = {Although deep web analysis has been studied extensively, there is no succinct formalism to describe user interactions with AJAX-enabled web applications. Toward this end, we introduce OXPath as a superset of XPath 1.0. Beyond XPath, OXPath is able (1) to fill web forms and trigger DOM events, (2) to access dynamically computed CSS attributes, (3) to navigate between visible form fields, and (4) to mark relevant information for extraction. This way, OXPath expressions can closely simulate the human interaction relevant for navigation rather than rely exclusively on the HTML structure. Thus, they are quite resilient against technical changes. We demonstrate the expressiveness and practical efficacy of OXPath to tackle a group flight planning problem. We use the OXPath implementation and visual interface to access the popular, highly-scripted travel site Kayak. We show, both visually and manually, how to formulate OXPath expressions to extract all booking information with just a few lines of code.}, } Although deep web analysis has been studied extensively, there is no succinct formalism to describe user interactions with AJAX-enabled web applications. Toward this end, we introduce OXPath as a superset of XPath 1.0. 
Beyond XPath, OXPath is able (1) to fill web forms and trigger DOM events, (2) to access dynamically computed CSS attributes, (3) to navigate between visible form fields, and (4) to mark relevant information for extraction. This way, OXPath expressions can closely simulate the human interaction relevant for navigation rather than rely exclusively on the HTML structure. Thus, they are quite resilient against technical changes. We demonstrate the expressiveness and practical efficacy of OXPath to tackle a group flight planning problem. We use the OXPath implementation and visual interface to access the popular, highly-scripted travel site Kayak. We show, both visually and manually, how to formulate OXPath expressions to extract all booking information with just a few lines of code. |
Wang, Cheng; Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Orsi, Giorgio; Schallhart, Christian You Need Only One Clue for Effective Record Segmentation (Inproceeding) International Conference Series on Web Intelligence, Mining and Semantics (WIMS'11), 2011. (BibTeX) @inproceedings{wang2011you, title = {You Need Only One Clue for Effective Record Segmentation}, author = {Wang, Cheng and Furche, Tim and Gottlob, Georg and Grasso, Giovanni and Orsi, Giorgio and Schallhart, Christian}, year = {2011}, date = {2011-01-01}, booktitle = {International Conference Series on Web Intelligence, Mining and Semantics (WIMS'11)}, } |
Sellers, Andrew; Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Schallhart, Christian Exploring the Web with OXPath (Inproceeding) International Workshop on Linked Web Data Management (LWDM'11), 2011. @inproceedings{sellers11:_explor_web_with_oxpat, title = {Exploring the Web with OXPath}, author = {Andrew Sellers and Tim Furche and Georg Gottlob and Giovanni Grasso and Christian Schallhart}, url = {http://dl.acm.org/citation.cfm?id=1966909}, year = {2011}, date = {2011-01-01}, booktitle = {International Workshop on Linked Web Data Management (LWDM'11)}, abstract = {OXPath is a careful extension of XPath that facilitates data extraction from the deep web. It is designed to facilitate the large-scale extraction of data from sophisticated modern web interfaces with client-side scripting and asynchronous server communication. Its main characteristics are (1) a minimal extension of XPath to allow page navigation and action execution, (2) a set-theoretic formal semantics for full OXPath, (3) and a sophisticated memory management that minimizes page buffering. In this poster, we briefly review the main features of the language and discuss ongoing and future work.}, } OXPath is a careful extension of XPath that facilitates data extraction from the deep web. It is designed to facilitate the large-scale extraction of data from sophisticated modern web interfaces with client-side scripting and asynchronous server communication. Its main characteristics are (1) a minimal extension of XPath to allow page navigation and action execution, (2) a set-theoretic formal semantics for full OXPath, (3) and a sophisticated memory management that minimizes page buffering. In this poster, we briefly review the main features of the language and discuss ongoing and future work. |
Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Guo, Xiaonan; Orsi, Giorgio; Schallhart, Christian Real Understanding for Real Estate Forms (Inproceeding) International Conference on Web Intelligence, Mining and Semantics (WIMS'11), 2011. @inproceedings{furche11:_real_under_for_real_estat_forms, title = {Real Understanding for Real Estate Forms}, author = {Tim Furche and Georg Gottlob and Giovanni Grasso and Xiaonan Guo and Giorgio Orsi and Christian Schallhart}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/wims-2011-Real-Understanding-of-Real-Estate-Forms.pdf}, year = {2011}, date = {2011-01-01}, booktitle = {International Conference on Web Intelligence, Mining and Semantics (WIMS'11)}, abstract = {Finding an apartment is a lengthy and tedious process. Once decided, one can never be sure not to have missed an even better offer which would have been just one click away. Form understanding is key to automatically access and process all the relevant—and nowadays readily available—data. We introduce opal (ontology-based web pattern analysis with logic), a novel, purely logical approach to web form understanding: opal labels, structures, and groups form fields according to a domain-specific ontology linked through phenomenological rules to a logical representation of a DOM. The phenomenological rules describe how ontological concepts appear on the web; the ontology formalizes and structures common patterns of web pages observed in a domain. A unique feature of opal is that all domain-independent assumptions about web forms are represented in rules, whereas domain-specific assumptions are represented in the ontology. This yields a coherent logical framework, robust in face of changing web trends. We apply opal to a significant, randomly selected sample of UK real estate sites, showing that straightforward rules suffice to achieve high precision form understanding.}, } Finding an apartment is a lengthy and tedious process. 
Once decided, one can never be sure not to have missed an even better offer which would have been just one click away. Form understanding is key to automatically access and process all the relevant—and nowadays readily available—data. We introduce opal (ontology-based web pattern analysis with logic), a novel, purely logical approach to web form understanding: opal labels, structures, and groups form fields according to a domain-specific ontology linked through phenomenological rules to a logical representation of a DOM. The phenomenological rules describe how ontological concepts appear on the web; the ontology formalizes and structures common patterns of web pages observed in a domain. A unique feature of opal is that all domain-independent assumptions about web forms are represented in rules, whereas domain-specific assumptions are represented in the ontology. This yields a coherent logical framework, robust in face of changing web trends. We apply opal to a significant, randomly selected sample of UK real estate sites, showing that straightforward rules suffice to achieve high precision form understanding. |
Sellers, Andrew; Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Schallhart, Christian OXPath: Little Language, Little Memory, Great Value (Inproceeding) International Conference on World Wide Web (WWW'11), Page(s): 261--264, 2011. @inproceedings{sellers11:_oxpat, name = {OXPath: Little Language, Little Memory, Great Value}, author = {Andrew Sellers and Tim Furche and Georg Gottlob and Giovanni Grasso and Christian Schallhart}, url = {http://doi.acm.org/10.1145/1963192.1963304}, year = {2011}, date = {2011-01-01}, booktitle = {International Conference on World Wide Web (WWW'11)}, pages = {261--264}, } |
Furche, Tim; Gottlob, Georg; Grasso, Giovanni; Orsi, Giorgio; Schallhart, Christian; Wang, Cheng Little Knowledge Rules The Web: Domain-Centric Result Page Extraction (Inproceeding) International Conference on Web Reasoning and Rule Systems (RR'11), Page(s): 61--76, 2011. @inproceedings{furche11:_littl_knowl_rules_web, name = {Little Knowledge Rules The Web: Domain-Centric Result Page Extraction}, author = {Tim Furche and Georg Gottlob and Giovanni Grasso and Giorgio Orsi and Christian Schallhart and Cheng Wang}, url = {http://www.giovannigrasso.it/wp-content/uploads/2012/02/rr-2011-Little-Knowledge-Rules-The-Web-Domain-Centric-Result-Page-Extraction.pdf}, year = {2011}, date = {2011-01-01}, booktitle = {International Conference on Web Reasoning and Rule Systems (RR'11)}, pages = {61--76}, abstract = {Web extraction is the task of turning unstructured HTML into structured data. Previous approaches rely exclusively on detecting repeated structures in result pages. These approaches trade intensive user interaction for precision. In this paper, we introduce the Amber (“Adaptable Model-based Extraction of Result Pages”) system that replaces the human interaction with a domain ontology applicable to all sites of a domain. It models domain knowledge about (1) records and attributes of the domain, (2) low-level (textual) representations of these concepts, and (3) constraints linking representations to records and attributes. Parametrized with these constraints, otherwise domain-independent heuristics exploit the repeated structure of result pages to derive attributes and records. Amber is implemented in logical rules to allow an explicit formulation of the heuristics and easy adaptation to different domains.}, } Web extraction is the task of turning unstructured HTML into structured data. Previous approaches rely exclusively on detecting repeated structures in result pages. These approaches trade intensive user interaction for precision. 
In this paper, we introduce the Amber (“Adaptable Model-based Extraction of Result Pages”) system that replaces the human interaction with a domain ontology applicable to all sites of a domain. It models domain knowledge about (1) records and attributes of the domain, (2) low-level (textual) representations of these concepts, and (3) constraints linking representations to records and attributes. Parametrized with these constraints, otherwise domain-independent heuristics exploit the repeated structure of result pages to derive attributes and records. Amber is implemented in logical rules to allow an explicit formulation of the heuristics and easy adaptation to different domains. |
Grasso, Giovanni; Leone, Nicola; Manna, Marco; Ricca, Francesco ASP at work: spin-off and applications of the DLV system (Incollection) Balduccini, Marcello; Son, Tran Cao (Ed.): Logic programming, knowledge representation, and nonmonotonic reasoning, Springer-Verlag, 2011, ISBN: 978-3-642-20831-7. @incollection{Grasso:2011:AWS:2001078.2001107, name = {ASP at work: spin-off and applications of the DLV system}, author = {Grasso, Giovanni and Leone, Nicola and Manna, Marco and Ricca, Francesco}, editor = {Balduccini, Marcello and Son, Tran Cao}, url = {http://dl.acm.org/citation.cfm?id=2001078.2001107}, isbn = {978-3-642-20831-7}, year = {2011}, date = {2011-01-01}, booktitle = {Logic programming, knowledge representation, and nonmonotonic reasoning}, pages = {432--451}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, chapter = {ASP at work: spin-off and applications of the DLV system}, } |
Ricca, F.; Grasso, G.; Alviano, M.; Manna, M.; Lio, V.; Iiritano, S.; Leone, N. Team-building with answer set programming in the Gioia-Tauro seaport (Article) Theory and Practice of Logic Programming, 1, 1, Page(s): 1--21, 2011. @article{ricca2011team, name = {Team-building with answer set programming in the Gioia-Tauro seaport}, author = {Ricca, F. and Grasso, G. and Alviano, M. and Manna, M. and Lio, V. and Iiritano, S. and Leone, N.}, url = {http://dx.doi.org/10.1017/S147106841100007X}, year = {2011}, date = {2011-01-01}, journal = {Theory and Practice of Logic Programming}, volume = {1}, number = {1}, pages = {1--21}, publisher = {Cambridge Univ Press}, abstract = {The seaport of Gioia Tauro is the largest transshipment terminal of the Mediterranean coast. A crucial management task for the companies operating in the seaport is team-building: the problem of properly allocating the available personnel for serving the incoming ships. Teams have to be carefully arranged in order to meet several constraints, such as allocation of employees with appropriate skills, fair distribution of the working load, and turnover of the heavy/dangerous roles. This makes team-building a hard and expensive task requiring several hours of manual preparation per day. In this paper we present a system based on Answer Set Programming for the automatic generation of the teams of employees in the seaport of Gioia Tauro. The system is currently exploited in the Gioia Tauro seaport by ICO BLG, a company specialized in automobile logistics.}, } The seaport of Gioia Tauro is the largest transshipment terminal of the Mediterranean coast. A crucial management task for the companies operating in the seaport is team-building: the problem of properly allocating the available personnel for serving the incoming ships. Teams have to be carefully arranged in order to meet several constraints, such as allocation of employees with appropriate skills, fair distribution of the working load, and turnover of the heavy/dangerous roles. 
This makes team-building a hard and expensive task requiring several hours of manual preparation per day. In this paper we present a system based on Answer Set Programming for the automatic generation of the teams of employees in the seaport of Gioia Tauro. The system is currently exploited in the Gioia Tauro seaport by ICO BLG, a company specialized in automobile logistics. |
2010 |
Ricca, Francesco; Dimasi, Antonella; Grasso, Giovanni; Ielpa, Salvatore Maria; Iiritano, Salvatore; Manna, Marco; Leone, Nicola A Logic-Based System for e-Tourism (Article) Fundamenta Informaticae, 105, 1, Page(s): 35--55, 2010. @article{DBLP:journals/fuin/RiccaDGIIML10, name = {A Logic-Based System for e-Tourism}, author = {Francesco Ricca and Antonella Dimasi and Giovanni Grasso and Salvatore Maria Ielpa and Salvatore Iiritano and Marco Manna and Nicola Leone}, url = {http://dx.doi.org/10.3233/FI-2010-357}, year = {2010}, date = {2010-01-01}, journal = {Fundamenta Informaticae}, volume = {105}, number = {1}, pages = {35--55}, publisher = {IOS Press}, } |
Grasso, Giovanni; Iiritano, Salvatore; Leone, Nicola; Lio, Vincenzino; Ricca, Francesco; Scalise, Francesco An ASP-Based System for Team-Building in the Gioia-Tauro Seaport (Incollection) Carro, Manuel; Peña, Ricardo (Ed.): Practical Aspects of Declarative Languages, Springer Berlin / Heidelberg, 2010, ISBN: 978-3-642-11502-8. @incollection{springerlink:10.1007/978-3-642-11503-5_5, name = {An ASP-Based System for Team-Building in the Gioia-Tauro Seaport}, author = {Grasso, Giovanni and Iiritano, Salvatore and Leone, Nicola and Lio, Vincenzino and Ricca, Francesco and Scalise, Francesco}, editor = {Carro, Manuel and Peña, Ricardo}, url = {http://dx.doi.org/10.1007/978-3-642-11503-5_5}, isbn = {978-3-642-11502-8}, year = {2010}, date = {2010-01-01}, booktitle = {Practical Aspects of Declarative Languages}, volume = {5937}, pages = {40--42}, publisher = {Springer Berlin / Heidelberg}, series = {Lecture Notes in Computer Science}, } |
2009 |
Ricca, F.; Gallucci, L.; Schindlauer, R.; Dell’Armi, T.; Grasso, G.; Leone, N. OntoDLV: an ASP-based system for enterprise ontologies (Article) Journal of Logic and Computation, 19, 4, Page(s): 643--670, 2009. @article{ricca2009ontodlv, name = {OntoDLV: an ASP-based system for enterprise ontologies}, author = {Ricca, F. and Gallucci, L. and Schindlauer, R. and Dell’Armi, T. and Grasso, G. and Leone, N.}, url = {http://dx.doi.org/10.1093/logcom/exn042}, year = {2009}, date = {2009-01-01}, journal = {Journal of Logic and Computation}, volume = {19}, number = {4}, pages = {643--670}, publisher = {Oxford Univ Press}, } |
Grasso, Giovanni; Iiritano, Salvatore; Leone, Nicola; Ricca, Francesco Some DLV Applications for Knowledge Management (Incollection) Erdem, Esra; Lin, Fangzhen; Schaub, Torsten (Ed.): Logic Programming and Nonmonotonic Reasoning, Springer Berlin / Heidelberg, 2009, ISBN: 978-3-642-04237-9. @incollection{springerlink:10.1007/978-3-642-04238-6_63, name = {Some DLV Applications for Knowledge Management}, author = {Grasso, Giovanni and Iiritano, Salvatore and Leone, Nicola and Ricca, Francesco}, editor = {Erdem, Esra and Lin, Fangzhen and Schaub, Torsten}, url = {http://dx.doi.org/10.1007/978-3-642-04238-6_63}, isbn = {978-3-642-04237-9}, year = {2009}, date = {2009-01-01}, booktitle = {Logic Programming and Nonmonotonic Reasoning}, volume = {5753}, pages = {591--597}, publisher = {Springer Berlin / Heidelberg}, series = {Lecture Notes in Computer Science}, } |
2005 |
Ricca, Francesco; Leone, Nicola; De Bonis, Valerio; Dell'Armi, Tina; Galizia, Stefania; Grasso, Giovanni A DLP system with object-oriented features (Inproceeding) Proceedings of the 8th international conference on Logic Programming and Nonmonotonic Reasoning, Page(s): 432--436, Berlin, Heidelberg, Springer-Verlag, 2005, ISBN: 3-540-28538-5, 978-3-540-28538-0. @inproceedings{Ricca:2005:DSO:2081012.2081063, name = {A DLP system with object-oriented features}, author = {Ricca, Francesco and Leone, Nicola and De Bonis, Valerio and Dell'Armi, Tina and Galizia, Stefania and Grasso, Giovanni}, url = {http://dx.doi.org/10.1007/11546207_41}, isbn = {3-540-28538-5, 978-3-540-28538-0}, year = {2005}, date = {2005-01-01}, booktitle = {Proceedings of the 8th international conference on Logic Programming and Nonmonotonic Reasoning}, pages = {432--436}, publisher = {Springer-Verlag}, address = {Berlin, Heidelberg}, series = {LPNMR'05}, } |
-
Contact
giovannigrasso@gmail.com
office: +44 (0)1865 610648, skype: grassogio
Networks
linkedin, twitter
academia.eu, facebook,