Skip to main content
All Collections › Developing Actors
Scraping data from websites using schema.org Microdata
Scraping data from websites using schema.org Microdata

JavaScript code to automatically extract data using schema.org tags

Hany Duong avatar
Written by Hany Duong
Updated over 2 months ago

Apify users sometimes need to extract data from websites that use schema.org Microdata attributes to describe the meaning of their HTML content. Clearly, there must be some simple way to extract structured data from such HTML purely using these schema.org tags. Well, look no further. Simply use the following JavaScript code for your Web Scraper page function to automatically extract the data:

/**
 * Extracts every schema.org Microdata item found on the current page.
 *
 * Usable both as an Apify Web Scraper page function (jQuery is read from
 * `context.jQuery`, which requires the "Inject jQuery" option) and as a
 * function passed to Puppeteer's `page.evaluate()` after jQuery has been
 * injected into the page (jQuery is then read from `window`). All helpers
 * live inside the function so it stays self-contained when it is serialized
 * and sent to the browser.
 *
 * @param {object} [context] Web Scraper context; only `context.jQuery` is read.
 * @returns {Promise<object[]>} One object per top-level item. Each object has
 *   a `_type` key (the `itemtype` attribute) plus one key per `itemprop`;
 *   repeated properties are collected into arrays and nested items become
 *   nested objects. An item without any property gets a single `_value` key.
 * @throws {Error} If no jQuery instance can be found.
 */
async function pageFunction(context) {
    // Web Scraper does not register jQuery globally, so prefer the instance
    // on the context and only then fall back to a page-level global.
    const $ = (context && context.jQuery)
        || (typeof window !== 'undefined' ? (window.jQuery || window.$) : undefined);
    if (typeof $ !== 'function') {
        throw new Error('jQuery is not available: enable "Inject jQuery" or inject jQuery into the page first.');
    }

    // Elements whose Microdata value is carried by an attribute, not by text.
    const VALUE_ATTRIBUTES = new Map([
        ['a', 'href'], ['area', 'href'], ['link', 'href'],
        ['audio', 'src'], ['embed', 'src'], ['iframe', 'src'], ['img', 'src'],
        ['source', 'src'], ['track', 'src'], ['video', 'src'],
        ['object', 'data'],
        ['data', 'value'], ['meter', 'value'],
        ['time', 'datetime'],
    ]);

    // Returns the trimmed value of a property element, or null if it has none.
    // Order: explicit `content` attribute, the attribute that carries the value
    // for this tag, the element text, then `src`/`href` as a last resort.
    // Attribute values are returned as written (relative URLs are not resolved).
    const extractValue = function (elem) {
        const $elem = $(elem);
        const valueAttribute = VALUE_ATTRIBUTES.get(String(elem.tagName || '').toLowerCase());
        const candidates = [
            $elem.attr('content'),
            valueAttribute ? $elem.attr(valueAttribute) : undefined,
            $elem.text(),
            $elem.attr('src'),
            $elem.attr('href'),
        ];
        for (const candidate of candidates) {
            // Whitespace-only candidates must not hide a later, real value.
            if (typeof candidate === 'string' && candidate.trim() !== '') {
                return candidate.trim();
            }
        }
        return null;
    };

    // Stores `value` under `propName`; a repeated property becomes an array.
    const addProperty = function (item, propName, value) {
        if (typeof value === 'string') {
            value = value.trim();
        }
        if (Array.isArray(item[propName])) {
            item[propName].push(value);
        } else if (Object.prototype.hasOwnProperty.call(item, propName)) {
            // Own-property check so names such as "toString" are not mistaken
            // for already-present values inherited from Object.prototype.
            item[propName] = [item[propName], value];
        } else {
            item[propName] = value;
        }
    };

    // Builds the object for one itemscope element, recursing into nested items.
    const extractItem = function (elem) {
        const item = { _type: $(elem).attr('itemtype') };
        let count = 0;
        // Iterate itemprops that are not nested in another itemscope.
        $(elem).find('[itemprop]').filter(function () {
            return $(this).parentsUntil(elem, '[itemscope]').length === 0;
        }).each(function () {
            const value = $(this).is('[itemscope]') ? extractItem(this) : extractValue(this);
            addProperty(item, $(this).attr('itemprop'), value);
            count++;
        });
        // Special case - output at least something.
        if (count === 0) {
            addProperty(item, '_value', extractValue(elem));
        }
        return item;
    };

    // Collects all top-level items in document order.
    const extractAllItems = function () {
        const items = [];
        // An itemscope is top-level when it is not a property of another item:
        // it has no itemprop attribute, or no enclosing itemscope to belong to.
        $('[itemscope]').filter(function () {
            const $scope = $(this);
            return !$scope.is('[itemprop]') || $scope.parents('[itemscope]').length === 0;
        }).each(function () {
            items.push(extractItem(this));
        });
        return items;
    };

    return extractAllItems();
}

For a standalone Actor with Puppeteer, don't forget to inject jQuery into the page and pass this function to page.evaluate:

// Some initial code to open the page
// Inject jQuery into the page so the code run by page.evaluate can use it
await Apify.utils.puppeteer.injectJQuery(page);
// Run pageFunction inside the page; `data` receives the value it returns
const data = await page.evaluate(pageFunction);
// profit!

For example, when crawling the IMDB entry for the movie Inception, the page function returns results similar to the following (the exact values depend on the page's markup at the time of the crawl):

[{
"_type": "http://schema.org/Movie",
"aggregateRating": {
"_type": "http://schema.org/AggregateRating",
"ratingValue": "8.8",
"bestRating": "10",
"ratingCount": "1,507,540",
"reviewCount": [
"2,825 user",
"647 critic"
]
},
"name": "Inception (2010)",
"contentRating": [
"PG-13",
"Rated PG-13 for sequences of violence and action throughout"
],
"duration": [
"2h 28min",
"148 min"
],
"genre": [
"Action",
"Adventure",
"Sci-Fi",
"Genres:\n Action |\n Adventure |\n Sci-Fi |\n Thriller"
],
"datePublished": [
"2010-07-16",
"17 November 2016 5:09 PM, -05:00",
"14 November 2016 4:03 PM, -05:00",
"14 November 2016 9:46 AM, -05:00"
],
"image": [
"https://images-na.ssl-images-amazon.com/images/M/MV5BMjAxMzY3NjcxNF5BMl5BanBnXkFtZTcwNTI5OTM0Mw@@._V1_UX182_CR0,0,182,268_AL_.jpg",
"http://ia.media-imdb.com/images/M/MV5BMjI2NDI3NTA4M15BMl5BanBnXkFtZTgwMDQ5NjYzMzE@._V1_CR167,0,946,532_AL_UY268_CR84,0,477,268_AL_.jpg",
"https://images-na.ssl-images-amazon.com/images/M/MV5BMDAzNDAxOGItNWFlMC00NjE1LTk2ZjctYzhiYTEzMjYxNzdiXkEyXkFqcGdeQXVyNTY0MTkxMTg@._CR241,39,1153,864_UX614_UY460._SY230_SX307_AL_.jpg",
"https://images-na.ssl-images-amazon.com/images/G/01/imdb/images/widget/amazon._CB339202444_.png",
"https://images-na.ssl-images-amazon.com/images/M/MV5BMTM5MjMyMTgxNl5BMl5BanBnXkFtZTcwOTM0OTk1Mw@@._V1_UY105_CR26,0,105,105_AL_.jpg",
"https://images-na.ssl-images-amazon.com/images/M/MV5BMTM0MjUzNjkwMl5BMl5BanBnXkFtZTcwNjY0OTk1Mw@@._V1_UX105_CR0,0,105,105_AL_.jpg",
"https://images-na.ssl-images-amazon.com/images/M/MV5BMjExMjkwNTQ0Nl5BMl5BanBnXkFtZTcwNTY0OTk1Mw@@._V1_UX105_CR0,0,105,105_AL_.jpg"
],
"trailer": "",
"description": [
"A thief, who steals corporate secrets through use of dream-sharing technology, is given the inverse task of planting an idea into the mind of a CEO.",
"Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb's rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible - inception. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming. Written by\nWarner Bros. Pictures"
],
"director": {
"_type": "http://schema.org/Person",
"url": "Christopher Nolan",
"name": "Christopher Nolan"
},
"creator": [
{
"_type": "http://schema.org/Person",
"url": "Christopher Nolan",
"name": "Christopher Nolan"
},
{
"_type": "http://schema.org/Organization",
"url": "Warner Bros.",
"name": "Warner Bros."
},
{
"_type": "http://schema.org/Organization",
"url": "Legendary Entertainment",
"name": "Legendary Entertainment"
},
{
"_type": "http://schema.org/Organization",
"url": "Syncopy",
"name": "Syncopy"
}
],
"actors": [
{
"_type": "http://schema.org/Person",
"url": "Leonardo DiCaprio",
"name": "Leonardo DiCaprio"
},
{
"_type": "http://schema.org/Person",
"url": "Joseph Gordon-Levitt",
"name": "Joseph Gordon-Levitt"
},
{
"_type": "http://schema.org/Person",
"url": "Ellen Page",
"name": "Ellen Page"
}
],
"headline": [
"Why Eddie Redmayne Isn’t Worried That ‘Fantastic Beasts’ Expanded to Five Films",
"‘Doctor Strange’: How VFX Tapped ‘Inception’ for a New Marvel Dimension",
"The Crown review: a mesh of history books and melodrama"
],
"provider": [
"Variety - Film News",
"Indiewire",
"Den of Geek"
],
"url": [
"Show HTML",
"View more styles",
"Official Facebook",
"Warner Bros. [Germany]",
"See more",
"USA",
"UK",
"English",
"Japanese",
"French",
"See more",
"See more",
"Bedfordshire, England, UK",
"See more",
"See more",
"See more",
"company contact information",
"IMDbPro",
"Dolby Digital",
"DTS",
"SDDS",
"Color",
"full technical specs"
],
"awards": [
"Won\n 4\n Oscars.",
"Another\n 144 wins & 198 nominations."
],
"thumbnailUrl": [
"/title/tt1375666/mediaviewer/rm3636364032?context=default&ref_=tt_pv_md_1",
"/title/tt1375666/mediaviewer/rm918520576?context=default&ref_=tt_pv_md_2",
"/title/tt1375666/mediaviewer/rm935297792?context=default&ref_=tt_pv_md_3"
],
"actor": [
{
"_type": "http://schema.org/Person",
"url": "Leonardo DiCaprio",
"name": "Leonardo DiCaprio"
},
{
"_type": "http://schema.org/Person",
"url": "Joseph Gordon-Levitt",
"name": "Joseph Gordon-Levitt"
},
{
"_type": "http://schema.org/Person",
"url": "Ellen Page",
"name": "Ellen Page"
},
{
"_type": "http://schema.org/Person",
"url": "Tom Hardy",
"name": "Tom Hardy"
},
{
"_type": "http://schema.org/Person",
"url": "Ken Watanabe",
"name": "Ken Watanabe"
},
{
"_type": "http://schema.org/Person",
"url": "Dileep Rao",
"name": "Dileep Rao"
},
{
"_type": "http://schema.org/Person",
"url": "Cillian Murphy",
"name": "Cillian Murphy"
},
{
"_type": "http://schema.org/Person",
"url": "Tom Berenger",
"name": "Tom Berenger"
},
{
"_type": "http://schema.org/Person",
"url": "Marion Cotillard",
"name": "Marion Cotillard"
},
{
"_type": "http://schema.org/Person",
"url": "Pete Postlethwaite",
"name": "Pete Postlethwaite"
},
{
"_type": "http://schema.org/Person",
"url": "Michael Caine",
"name": "Michael Caine"
},
{
"_type": "http://schema.org/Person",
"url": "Lukas Haas",
"name": "Lukas Haas"
},
{
"_type": "http://schema.org/Person",
"url": "Tai-Li Lee",
"name": "Tai-Li Lee"
},
{
"_type": "http://schema.org/Person",
"url": "Claire Geare",
"name": "Claire Geare"
},
{
"_type": "http://schema.org/Person",
"url": "Magnus Nolan",
"name": "Magnus Nolan"
}
],
"keywords": [
"Plot Keywords:\n dream\n |\n subconscious\n |\n ambiguous ending\n |\n thief\n |\n architecture\n | See All (283) »",
"dream",
"subconscious",
"ambiguous ending",
"thief",
"architecture"
],
"audience": {
"_type": "http://schema.org/Audience",
"url": "View content advisory"
},
"review": {
"_type": "http://schema.org/Review",
"name": "Sci-fi perfection. A truly mesmerizing film.",
"reviewRating": {
"_type": "http://schema.org/Rating",
"worstRating": "1",
"ratingValue": "8",
"bestRating": "10"
},
"author": "dvc5159",
"datePublished": "2010-07-15",
"reviewBody": "I'm nearly at a loss for words. Just when you thought Christopher Nolan couldn't follow up to \"The Dark Knight\", he does it again, delivering another masterpiece, one with so much power and rich themes that has been lost from the box office for several years..."
}
}]
Did this answer your question?