Python爬虫爬取Google Play 100万个App的数据，并入库到数据库 scrapy框架|电子爱好者

admin管理员组
文章数量:1604637

代码目录结构

相关文件代码

google.py爬虫主要代码

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor
from app.items import GoogleItem
from language_linkextractor import LanguageLinkExtractor
import urlparse
import sys

class GoogleSpider(CrawlSpider):
    """Crawl Google Play app-detail pages and emit one ``GoogleItem`` per app.

    The crawl starts from the store front page plus one seed app. The single
    rule follows every link whose URL matches ``/store/apps/details`` (as
    extracted by ``LanguageLinkExtractor``) and passes each fetched page to
    ``parse_app``.
    """

    # NOTE(review): Python 2-only workaround kept from the original so the
    # process-wide default encoding stays UTF-8. ``reload`` is not a builtin
    # on Python 3, so this must be removed when the project is ported.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    name = "google"
    # The host must be the full "play.google.com"; a truncated "play.google"
    # does not match the store's real hostname.
    allowed_domains = ["play.google.com"]
    start_urls = (
        'http://play.google.com/',
        'https://play.google.com/store/apps/details?id=me.ele',
    )
    rules = [
        Rule(
            LanguageLinkExtractor(allow=("/store/apps/details", )),
            callback='parse_app',
            follow=True
        ),
    ]

    def parse_app(self, response):
        """Build a ``GoogleItem`` from one app-detail page.

        The package name comes from the ``id`` query parameter of the page
        URL; every other field is the list of text nodes returned by the
        corresponding XPath query (an empty list when nothing matches).

        Pages whose URL carries no ``id`` parameter are logged and skipped
        instead of raising ``KeyError``.
        """
        query = urlparse.urlparse(response.url).query
        # keep_blank_values=True so an empty "id=" is still reported as present.
        params = urlparse.parse_qs(query, True)
        package_ids = params.get('id')
        if not package_ids:
            self.logger.warning("No package id in URL, skipping: %s", response.url)
            return

        item = GoogleItem()
        item['package'] = ','.join(package_ids)
        item['num'] = response.xpath("//div[@itemprop='numDownloads']").xpath("text()").extract()
        item['score'] = response.xpath("//div[@class='score']").xpath("text()").extract()
        item['review'] = response.xpath("//span[@class='reviews-num']").xpath("text()").extract()
        item['company'] = response.xpath("//span[@itemprop='name']").xpath("text()").extract()
        item['category'] = response.xpath("//span[@itemprop='genre']").xpath("text()").extract()
        item['name'] = response.xpath("//div[@class='id-app-title']").xpath("text()").extract()
        yield item

language_linkextractor.py解决语言乱码问题

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

class LanguageLinkExtractor(LxmlLinkExtractor):
    """Link extractor that forces ``hl=en`` onto every extracted link.

    Links are extracted by ``LxmlLinkExtractor`` as usual; each resulting
    URL is then rewritten so its query string carries ``hl=en``, which is
    how the crawl requests the English version of each page.
    """

    @staticmethod
    def addParams(url):
        """Return *url* with ``hl=en`` set in its query string.

        * An existing ``hl=...`` parameter is overwritten rather than
          duplicated, so the function is idempotent.
        * The parameter is inserted before any ``#fragment`` instead of
          being appended after it.
        """
        import re  # local import keeps this block self-contained

        base, hash_mark, fragment = url.partition('#')
        path, question_mark, query = base.partition('?')
        if question_mark:
            query, replaced = re.subn(r'(^|&)hl=[^&]*', r'\1hl=en', query)
            if not replaced:
                query += '&hl=en'
        else:
            query = 'hl=en'
        return path + '?' + query + hash_mark + fragment

    def extract_links(self, response):
        """Extract links via the parent class, then add ``hl=en`` to each URL.

        The link objects returned by the parent are mutated in place and the
        same list is returned.
        """
        links = LxmlLinkExtractor.extract_links(self, response)
        for link in links:
            link.url = LanguageLinkExtractor.addParams(link.url)
        return links

items.py相关item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
class GoogleItem(scrapy.Item):
    """Container for the data scraped from one Google Play app-detail page.

    ``package`` is the app's package id taken from the page URL; the other
    fields hold whatever text the spider extracted for them.
    """

    # The page URL is intentionally not stored:
    # url = scrapy.Field()

    num = scrapy.Field()       # download count
    package = scrapy.Field()   # package id from the "id" query parameter
    score = scrapy.Field()     # rating score
    review = scrapy.Field()    # number of reviews
    company = scrapy.Field()   # publisher name
    category = scrapy.Field()  # genre
    name = scrapy.Field()      # app title

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class AppPipeline(object):
    """Pass-through item pipeline: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return *item* untouched; *spider* is accepted but not used."""
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for app project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = 'app'

# Modules Scrapy searches for spider classes, and the module in which
# `scrapy genspider` creates new spiders.
SPIDER_MODULES = ['app.spiders']
NEWSPIDER_MODULE = 'app.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'app (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'app.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'app.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'app.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# ITEM_PIPELINES = {
# 'scrapy_mongodb.MongoDBPipeline': 100
# }

# MONGODB_URI = 'mongodb://127.0.0.1:27017'
# MONGODB_DATABASE = 'scrapy'
# MONGODB_COLLECTION = 'play'
# Feed export: serialize every scraped item as CSV into google_play.csv.
# Scrapy reads the output location from FEED_URI; the previous name FEED_URL
# is not a Scrapy setting, so the feed destination was never configured.
FEED_URI = 'google_play.csv'
FEED_FORMAT = 'csv'

scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = app.settings

[deploy]
#url = http://localhost:6800/
project = app

本文标签：爬虫万个框架数据库数据

版权声明：本文标题：Python爬虫爬取Google Play 100万个App的数据，并入库到数据库 scrapy框架内容由热心网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：https://m.elefans.com/dianzi/1728465667a1159420.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

更多相关文章

xp系统

电子爱好者 - 最新技术资讯及电子产品介绍！

Python爬虫爬取Google Play 100万个App的数据，并入库到数据库 scrapy框架

更多相关文章

XAPP585框架详解-LVDS时钟恢复逻辑

专业解析：移动硬盘“要求格式化”背后的真相与数据救援策略

移动硬盘分区后数据还能恢复吗？

移动硬盘损坏怎么恢复数据？对症恢复更有效

如何恢复RAW格式移动硬盘的数据

spyder怎么显示文件目录_移动硬盘打不开，数据怎么恢复？

移动硬盘损坏如何恢复数据

KylinV10（银河麒麟国产操作系统）的安装及达梦数据库的安装

【大数据实训】基于Hadoop的2019年11月至2020年2月宁波天气数据分析（五）

Spring Boot、SpringCloud框架

Android 神器 xposed 框架使用指南

2017第三届美亚杯全国电子数据取证大赛个人赛write up

传输协议不安全，数据泄露谁之过？——流量劫持技术分析

AutoCad-查看坐标数据-画多段线命令-画圆命令-画文字-VBA

获取Google Play 下载 来源渠道 广告参数 相关数据

使用play-scraper进行Google Play数据抓取教程

Google Play Store谷歌应用商店游戏数据分析

Java 爬虫-谷歌商店(Google play)--应用的版本号

google服务框架 闪退_Google Play闪退怎么办 怎么用RE管理器解决闪退问题

NLP+VS︱深度学习数据集标注工具、图像语料数据库、实验室搜索ing...

发表评论

推荐文章

Windows蓝屏了，如何处理？

Windows系统下VirtualBox新建Ubuntu虚拟机并采用WindTerm和XShell远程连接（2022.5.11）

计算机主机发出滴滴声音,为什么Win7电脑总是发出滴滴滴声？出现的原因及解决方法...

AutoCad-查看坐标数据-画多段线命令-画圆命令-画文字-VBA

一起Talk Android吧（第五百四十六回：如何判断手机是否安装GooglePlay）

热门文章

辣椒app软件测试,辣椒视频(test flight)官方版

win10公网远程桌面工具 ShNat 实现原理和使用教程

SSH&RSYNC之SSH

解决window10系统电脑插入耳机之后没有声音的问题

vm15虚拟机没声音常见解决方法与设置了声卡也没声音

番茄花园 Ghost XP SP3 2013 电脑城极速装机版

cad怎么表示出一个孔_AutoCAD如何画一个带孔的立体球

在线CAD如何配合three.js绘制带线宽的线段

JustView(Office、CAD、三维模型)在线预览平台调用说明

Google Play创建商品，手机支付，订单管理完整流程

最新文章

语音转文字软件哪个好用？分享职场达人都在用的宝藏软件

22款神奇的Ubuntu软件

同声传译软件哪个好用？同声传译工具带你开启高效沟通新篇章

商品搜索引擎---分词（插件介绍与入门实例）

卸载Notepad++！事实证明，它更牛逼~

语音翻译在线语音翻译哪个好？详解语音翻译效果

ubuntu个人实用配置问题

PPT设计思维 - 邵云蛟

【新机】配置记录

Linux软件编程---Linux基础---标准IO常用函数接口

Ubuntu 必装软件及安装教程

手机图片文字翻译如何操作？4招告诉你

Ubuntu（Linux）上安装微信（windows应用）

Suiblime

嵌入式学习

小米手机肿么还原时钟

15000流明是多少瓦

一般普通投影机功率多大?

苹果绿联转换器有些投影机不能用

坚果V9投影机具体参数?

有关九年级作文850字精选

80后90后_高一作文

中级卫生专业资格中医全科学主治医师中级模拟题2021年(9)案与解析

(精品)师范大学招考硕士研究生课程八六0试卷

ZXMVC8900(V3

【模拟人生4（The Sims 4）性感露背黑色亮片礼服MOD V20190313】模拟人生4（The Sims 4）性感露背黑色亮片礼服MOD V20190313 官方免费下载

【生化危机2：重制版（Resident Evil 2 Remake）克莱尔红头发深色服装MOD】生化危机2：重制版（Resident Evil 2 Remake）克莱尔红头发深色服装MOD 官方免费下载

【模拟人生4（The Sims 4）性感露背深V领吊带裙MOD V20190311】模拟人生4（The Sims 4）性感露背深V领吊带裙MOD V20190311 官方免费下载

【模拟人生4（The Sims 4）科幻风宇宙飞船家庭住宅MOD V20190311】模拟人生4（The Sims 4）科幻风宇宙飞船家庭住宅MOD V20190311 官方免费下载

【鬼泣5（Devil May Cry V）v1.0十四项修改】鬼泣5（Devil May Cry V）v1.0十四项修改 官方免费下载

如何实现高效的treenode搜索算法

treenode与链表有何本质区别

在哪些场景下应优先考虑使用treenode

treenode在树形结构中的角色是什么

如何通过treenode实现二叉树

获取Google Play 下载来源渠道广告参数相关数据

google服务框架闪退_Google Play闪退怎么办怎么用RE管理器解决闪退问题

SSH&RSYNC之SSH

【鬼泣5（Devil May Cry V）v1.0十四项修改】鬼泣5（Devil May Cry V）v1.0十四项修改官方免费下载