一,代码
分析网页dom数据,列出章节列表
var http = require('http');
var cheerio = require('cheerio');

// Course page whose chapter/video list is scraped.
var url = 'http://www.imooc.com/learn/348';

/**
 * Parse the course page HTML and collect its chapters and their videos.
 *
 * Result shape:
 * [{
 *   chapterTitle: '',
 *   videos: [{ title: '', id: '' }]
 * }]
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array} One entry per `.chapter` element found in the page.
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var chapters = $('.chapter');
  var courseData = [];

  // cheerio's each() binds `this` to the current element.
  chapters.each(function () {
    var chapter = $(this);
    var chapterTitle = chapter.find('h3').text();
    var videos = chapter.find('.video').children('li');
    var chapterData = {
      chapterTitle: chapterTitle,
      videos: []
    };

    videos.each(function () {
      var video = $(this).find('.J-media-item');
      var videoTitle = video.text();
      // attr() yields undefined when the link is missing; fall back to an
      // empty string so split() cannot throw (id is then undefined).
      var href = video.attr('href') || '';
      var id = href.split('video/')[1];

      chapterData.videos.push({
        title: videoTitle,
        id: id
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Print every chapter title followed by its videos as "【id】title".
 *
 * @param {Array} courseData - Structure returned by filterChapters().
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    var chapterTitle = item.chapterTitle;

    console.log(chapterTitle + '\n');

    item.videos.forEach(function (video) {
      console.log('【' + video.id + '】' + video.title + '\n');
    });
  });
}

http.get(url, function (res) {
  var html = '';

  // Decode as UTF-8 at the stream level so multi-byte characters that
  // straddle two chunks are not corrupted by string concatenation.
  res.setEncoding('utf8');

  res.on('data', function (data) {
    html += data;
  });

  res.on('end', function () {
    var courseData = filterChapters(html);

    printCourseInfo(courseData);
  });
}).on('error', function (err) {
  console.log('获取课程数据出差错', err.message);
});
二,安装模块
cheerio是一个在Node.js服务端实现了jQuery核心API的模块,可以用jQuery的语法解析和操作HTML
npm install cheerio
三,执行
node crawler.js
四,注意
1,forEach和each
要弄清forEach和each的区别,需要明白一点:forEach是JavaScript数组的原生方法(针对数组),回调参数依次为(元素, 下标);而each是jQuery/cheerio中的方法(针对jQuery对象,即$( )的返回值),回调参数依次为(下标, 元素),并且回调中的this指向当前元素。