安装Python

安装Python依赖

1
2
# CentOS7
yum install -y gcc gcc-c++ make git patch openssl-devel zlib-devel readline-devel sqlite sqlite-devel bzip2-devel curl wget ncurses-devel sqlite-devel gdbm-devel xz-devel tk-devel

安装pyenv

1
git clone https://github.com/pyenv/pyenv.git ~/.pyenv

配置 ~/.bashrc

1
2
3
4
5
6
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc

echo -e 'if command -v pyenv 1>/dev/null 2>&1; then\n eval "$(pyenv init -)"\nfi' >> ~/.bashrc

exec "$SHELL"

安装Python

1
2
3
4
5
pyenv install 3.6.5 -v

pyenv global 3.6.5

pyenv rehash

/root/.pyenv/versions/3.6.5/bin/python

1
pip install virtualenvwrapper pipenv
1
2
3
4
5
6
7
vim ~/.bashrc

#start virtualenvwrapper
VIRTUALENVWRAPPER_PYTHON=/root/.pyenv/versions/3.6.5/bin/python
export WORKON_HOME='~/.virtualenv'
source /root/.pyenv/versions/3.6.5/bin/virtualenvwrapper.sh
#end

pipenv --python=/root/.pyenv/versions/3.6.5/bin/python

因Supervisor现在不支持python3的版本需要安装Python 2.7.14

pyenv install 2.7.14 -v
pyenv global 2.7.14
pyenv rehash

更改pypi源

1
2
3
4
5
6
7
# 新建pip.conf存放目录
mkdir ~/.pip&&cd ~/.pip

cat >pip.conf<<EOF
[global]
index-url = https://pypi.douban.com/simple
EOF

安装Supervisor Supervisor使用简介

新建虚拟环境

1
2
cd /usr/local/
mkdir py2 py3
1
2
3
4
cd /usr/local/py2
pipenv --python=/root/.pyenv/versions/2.7.14/bin/python

workon py2
1
pyenv virtualenv 2.7.13 supervisor

激活虚拟环境

1
source /root/.pyenv/versions/2.7.13/envs/supervisor/bin/activate

安装supervisor

1
pip install supervisor
1
2
3
mkdir -p /etc/supervisor/

echo_supervisord_conf > /etc/supervisord.conf
1
echo_supervisord_conf > /etc/supervisor/supervisord.conf
1
2
3
4
vim /etc/supervisord.conf

[include]
files = /etc/supervisor/conf.d/*.conf
1
/root/.virtualenv/py2-kYuLH55n/bin/supervisord -c /etc/supervisord.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
vim /usr/lib/systemd/system/supervisord.service

[Unit]
Description=supervisord - Supervisor process control system for UNIX
Documentation=http://supervisord.org
After=network.target

[Service]
Type=forking
ExecStart=/root/.virtualenv/py2-kYuLH55n/bin/supervisord -c /etc/supervisord.conf
ExecReload=/root/.virtualenv/py2-kYuLH55n/bin/supervisorctl reload
ExecStop=/root/.virtualenv/py2-kYuLH55n/bin/supervisorctl shutdown

[Install]
WantedBy=multi-user.target

启动supervisor

1
2
3
4
5
6
7
8
9
# 查看进程
ps aux | grep supervisord

systemctl daemon-reload
systemctl enable supervisord
systemctl start supervisord
systemctl status supervisord -l
systemctl stop supervisord
systemctl reload supervisord

Scrapyd

新建虚拟环境

1
2
3
4
cd /usr/local/py3
pipenv --python=/root/.pyenv/versions/3.6.5/bin/python

workon py3-hv-Nhry6
1
2
3
4
5
6
7
pyenv virtualenv 3.6.2 scrapyd

source /root/.pyenv/versions/3.6.2/envs/scrapyd/bin/activate

pyenv activate scrapyd

pyenv deactivate

安装scrapyd

1
pip install scrapyd

mkdir -p /data/scrapyd

配置scrapyd

/etc/supervisor/scrapyd.conf

1
2
3
4
5
6
7
8
9
cat >/etc/supervisor/scrapyd.conf<<EOF
[program:scrapyd]
; 使用虚拟环境 py3-hv-Nhry6 中的 scrapyd(workon 是 shell 函数,supervisor 无法直接执行,因此下面的 command 使用绝对路径)
directory=/data/scrapyd
command=/root/.virtualenv/py3-hv-Nhry6/bin/scrapyd
autostart=true
autorestart=true
redirect_stderr=true
EOF

重启supervisor

1
2
3
4
5
/root/.virtualenv/py2-kYuLH55n/bin/supervisorctl

status

reread|reload

安装SpiderKeeper

安装

1
2
3
source /root/.pyenv/versions/3.6.2/envs/scrapyd/bin/activate

pip install spiderkeeper

mkdir -p /data/spiderkeeper

配置spiderkeeper

/etc/supervisor/spiderkeeper.conf

1
2
3
4
5
6
7
8
9
cat >/etc/supervisor/spiderkeeper.conf<<EOF
[program:spiderkeeper]
; 使用虚拟环境中的 spiderkeeper(source 是 shell 内建命令,supervisor 无法直接执行,因此下面的 command 使用绝对路径)
directory=/data/spiderkeeper
command=/root/.virtualenv/py3-hv-Nhry6/bin/spiderkeeper --port=8000 --server=http://localhost:6800 --username=admin --password=admin
autostart=true
autorestart=true
startretries=3
EOF

重启supervisor

1
2
3
4
5
/root/.pyenv/versions/supervisor/bin/supervisorctl

status

reread|reload

###

网络设置

1
vim /Library/Preferences/VMware\ Fusion/networking

answer VNET_1_HOSTONLY_NETMASK 255.255.255.0
answer VNET_1_HOSTONLY_SUBNET 192.168.2.0
answer VNET_8_HOSTONLY_NETMASK 255.255.255.0
answer VNET_8_HOSTONLY_SUBNET 192.168.5.0

安装 macOS 系统

下载 安装 macOS Mojave.app

问题

找不到可以连接的有效对等进程

关闭 Virtualbox

1
sudo /Library/Application\ Support/VirtualBox/LaunchDaemons/VirtualBoxStartup.sh stop

参考

给VMWare Fusion设置固定IP

  • 在Github上申请token

Github Personal access tokens

  • pycharm 设置github token
    Preferences->Version Control->GitHub

  • pycharm上新建project,名字 python_crawler。

1
2
3
4
5
cd /Users/jinlong/PycharmProjects/python_crawler

pipenv --python=/Users/jinlong/.pyenv/versions/3.6.3/bin/python

sed -i "" "s/python.org/douban.com/g" Pipfile
  • 设置pycharm的python解释器
    Preferences->Project:python_crawler->Project Interpreter

  • 设置pycharm下的github

VCS->Import into Version Control->Share Project On Github

PyCharm 配置

  • Tab键设置成4个空格
    Preferences->Editor->Code Style->Python->Use tab character 取消勾选

  • 调整字母长度分割线
    Preferences->Editor->Code Style->Right margin(columns) 设置为80

  • 文件模板
    Preferences->Editor->File and Code Templates

    Enable Live Templates

1
2
3
4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'ovwane'
__date__ = '$DATE $HOUR:$MINUTE'

Pycharm安装、设置、优化

Google 开源项目风格指南-Python 风格指南 - 内容目录

PyCharm插件

Mongo Plugin

Markdown Navigator
Markdown support

Python工具

nmap
pip install python-nmap

安装Python

安装Python依赖

1
2
# CentOS7
yum install -y gcc gcc-c++ make git patch openssl-devel zlib-devel readline-devel sqlite sqlite-devel bzip2-devel curl wget ncurses-devel sqlite-devel gdbm-devel xz-devel tk-devel

安装pyenv

1
git clone https://github.com/pyenv/pyenv.git ~/.pyenv

配置 ~/.bashrc

1
2
3
4
5
6
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc

echo -e 'if command -v pyenv 1>/dev/null 2>&1; then\n eval "$(pyenv init -)"\nfi' >> ~/.bashrc

exec "$SHELL"

安装Python

1
2
3
4
5
pyenv install 3.6.5 -v

pyenv global 3.6.5

pyenv rehash

/root/.pyenv/versions/3.6.5/bin/python

1
pip install virtualenvwrapper pipenv
1
2
3
4
5
6
7
vim ~/.bashrc

#start virtualenvwrapper
VIRTUALENVWRAPPER_PYTHON=/root/.pyenv/versions/3.6.5/bin/python
export WORKON_HOME='~/.virtualenv'
source /root/.pyenv/versions/3.6.5/bin/virtualenvwrapper.sh
#end

pipenv --python=/root/.pyenv/versions/3.6.5/bin/python

因Supervisor现在不支持python3的版本需要安装Python 2.7.14

pyenv install 2.7.14 -v
pyenv global 2.7.14
pyenv rehash

更改pypi源

1
2
3
4
5
6
7
# 新建pip.conf存放目录
mkdir ~/.pip&&cd ~/.pip

cat >pip.conf<<EOF
[global]
index-url = https://pypi.douban.com/simple
EOF

求职

新人到底需要什么?

Python 爬虫学到什么样就可以找工作了?

我是如何从自学编程到找到工作的

有多少人按@萧井陌大神给出的Python+Flask路线找到工作了?

现在看的爬虫代码,学会了就可以找工作了

Python爬虫面试题

关于Python的面试题

Python爬虫面试(2017.09.18)

Python面试与分析

python工程师四家公司面试题

我的面试之旅

python爬虫相关

Python 爬虫的工具列表( 附Github代码下载链接)

Python入门网络爬虫之精华版

Systematically_self-study_Python

hackerxu

博客

Python

胡阳
the5fire的技术博客

廖雪峰
廖雪峰的官方网站

董伟明
小明明s à domicile | Python之美

Alex Li
金角大王等待唐僧的日子-老男孩教育Python自动化2.0课程课件目录

零度
零度
零度-文章-爬虫

Ele - A面

python爬虫&&数据挖掘

十四君
URl-team python爬虫&&数据挖掘

崔庆才
静觅 崔庆才的个人博客
崔庆才的个人博客-hellobi

州的先生
州的先生

Ehco Blog
Ehco Blog

Authors: A. Jesse Jiryu Davis and Guido van Rossum
aosabook/500lines

数据挖掘

宁哥的小站

#HTTP
HTTP POST参数

Python爬虫传送post请求要携带哪些参数?

60秒GET小技能-爬虫快速构建post参数法

#破解
破解极验(geetest)验证码 - CSDN博客

#数据可视化
有趣的数据可视化 - 简书

#人工
SnailTyan
SnailTyan

Python文档

Python一译中文文档

Python小技巧

python奇技淫巧

Python 3.X 里不包含字典类型的 has_key() 函数,被 __contains__(key)(即 key in dict 写法)替代

Django

Nyloner/NyBlog

Python FAQ

一次抓虫引发的对python导入机制的初步认识

编程指南

编程入门指南 v1.5

知乎萧井陌大神《编程入门指南v1.3》思维导图

Python协程

使用Python进行并发编程-asyncio篇(一)

aio-libs

A Web Crawler With asyncio Coroutines
A. Jesse Jiryu Davis and Guido van Rossum

性能调试

Python web 应用性能调优

http服务跟踪及调试工具

Pyflame: A Ptracing Profiler For Python

FAQ

Supervisor 的问题 minfds

Resin+Nginx动静分离和负载均衡

Resin+Nginx动静分离和负载均衡

案例:目前很多人喜欢Nginx+tomcat动静分离,或者反代后端tomcat集群,不过很多人也喜欢用Resin。
本人花了些功夫总结了Resin和tomcat区别:

特性\容器 resin tomcat
公司 CAUCHO Apache
是否收费 不完全免费(pro版本收费) 完全免费
Eclipse下调试开发 适中 复杂
性能 轻量级,pro版本支持负载均衡,以及缓存功能 轻量级(NIO模式性能高些),支持负载均衡
多实例 略麻烦 比较简单
集群部署 支持 支持
是否支持php 新版本支持(但很少用) 默认不支持(可配置)
用户喜好 略少 略多
常用组合 Nginx+Resin or+其它 Nginx+tomcat+or其它
报错机制 简单 复杂
标准|开发|行为喜好 两者在标准支持,开发使用,用户喜好有很大关系

常用JavaEE容器有很多:Tomcat、Resin、JBoss、Glassfish ,注意weblogic属于应用服务器。

1、安装配置Resin:
1.1)jdk目录创建:

1
2
3
4
5
6
7
8
[root@resin ~]# tar zxvf jdk-7u75-linux-x64.tar.gz
[root@resin ~]# mkdir /usr/local/jdk1.7
[root@resin ~]# mv jdk1.7.0_75/* /usr/local/jdk1.7/
[root@resin ~]# cat /etc/profile.d/jdk.sh
export JAVA_HOME=/usr/local/jdk1.7/
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin
export JAVA_HOME CLASSPATH PATH

1.2)Resin安装配置:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
[root@resin ~]# yum install ntpdate -y
[root@resin ~]# ntpdate time.windows.com
# 安装resin
[root@resin ~]# tar xf resin-4.0.50.tar.gz -C /usr/local/
[root@resin ~]# cd /usr/local
[root@resin ~]# #./configure --prefix=/usr/local/resin
[root@resin ~]# #make
[root@resin ~]# #make install
[root@resin local]# ln -s resin-4.0.50 resin
[root@resin local]# cat /etc/profile.d/resin.sh
export RESIN_HOME=/usr/local/resin
[root@resin local]#
[root@resin local]# cp /usr/local/resin/bin/resin.sh /etc/init.d/resin
[root@resin local]# chmod +x /etc/init.d/resin
[root@resin local]# /etc/init.d/resin start

1.3)首页访问:
HTTP://IP:8080

1.4)配置多个项目:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
[root@resin ~]# cd /usr/local/resin/conf
[root@resin conf]# vim resin.xml
#配置多个项目:
<cluster id="app1">
<!-- define the servers in the cluster -->
<server-multi id-prefix="app1" address-list="${app1_servers}" port="6800"/> //端口1
<!-- the default host, matching any host name -->
<host id="" root-directory=".">
<web-app id="/" root-directory="/usr/local/resin/webapps/app1"/> //项目1
</host>
</cluster>
<cluster id="app2">
<!-- define the servers in the cluster -->
<server-multi id-prefix="app2" address-list="${app2_servers}" port="6801"/> //端口2
<!-- the default host, matching any host name -->
<host id="" root-directory=".">
<web-app id="/" root-directory="/usr/local/resin/webapps/app2"/> //项目2
</host>
</cluster>

1.4.1)定义端口:

1
2
3
4
5
# app-tier Triad servers: app-0 app-1 app-2 
app1_servers : 127.0.0.1:6800
app2_servers : 127.0.0.1:6801
app1.http : 8080
app2.http : 8081

1.5)JDBC配置:

1
2
3
4
5
6
7
8
9
10
11
<database>
<jndi-name>jdbc/test</jndi-name>
<driver type="com.microsoft.jdbc.sqlserver.SQLServerDriver">
<url>jdbc:microsoft:sqlserver://localhost:3306;databasename=Northwind</url> //后端数据库
<user>sa</user>
<password>password</password> //密码
</driver>
<prepared-statement-cache-size>8</prepared-statement-cache-size>
<max-connections>20</max-connections>
<max-idle-time>30s</max-idle-time>
</database>

注意:jdbc文件可自己定义,需要导入相应的驱动包。

2、安装配置Nginx:

1
2
3
4
useradd nginx -M -s /sbin/nologin
tar xf nginx-1.9.2.tar.gz
cd nginx-1.9.2
./configure --user=nginx --group=nginx --prefix=/usr/local/nginx --with-http_stub_status_module --with-http_ssl_module --with-http_realip_module --with-http_flv_module --with-http_mp4_module --with-http_gzip_static_module && make && make install

2.1)nginx.conf配置负载均衡:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
user  nginx; 

worker_processes 8;

#worker_cpu_affinity 00000001 00000010 00000100 00001000 00010000 00100000 01000000 10000000;

error_log logs/error.log info;

pid /var/run/nginx.pid;

events {
use epoll;
worker_connections 1024;
}

http {
include mime.types;

default_type application/octet-stream;

charset UTF-8;

server_names_hash_bucket_size 128;

client_header_buffer_size 32k;

large_client_header_buffers 4 32k;

client_max_body_size 8m;

#limit_conn_zone $binary_remote_addr zone=one:32k;
#limit_conn_zone $binary_remote_addr zone=permitip:10m;

error_page 404 = http://www.jb51.net/404.html;

#error_page 404 = /40x.html;
#location = /40x.html{
#root html;
#}

#error_page 500 502 503 504 /50x.html;
#location = /50x.html {
#root html;
#}

open_file_cache max=102400 inactive=20s;

sendfile on;

#autoindex on;

tcp_nopush on;
tcp_nodelay on;

keepalive_timeout 60;

gzip on;
gzip_min_length 1k;
gzip_buffers 4 16k;
gzip_http_version 1.0;
gzip_comp_level 2;
#gzip_types text/plain application/x-javascript text/css application/xml;
gzip_vary on;

fastcgi_connect_timeout 300;
fastcgi_send_timeout 300;
fastcgi_read_timeout 300;
fastcgi_buffer_size 64k;
fastcgi_buffers 4 64k;
fastcgi_busy_buffers_size 128k;
fastcgi_temp_file_write_size 128k;

#如果要启用负载均衡
#upstream www.xxx.com {
#zone myapp1 64k;
#server 192.168.1.220:80 weight=1 max_fails=2 fail_timeout=30s slow_start=30s;
#server 192.168.1.221:80 weight=1 max_fails=2 fail_timeout=30s;
#}

#upstream www.xxx.org {
#zone myapp1 64k;
#server 192.168.1.220:80 weight=1 max_fails=2 fail_timeout=30s slow_start=30s;
#server 192.168.1.221:80 weight=1 max_fails=2 fail_timeout=30s;
#}
log_format access '$remote_addr - $remote_user [$time_local] $request $status $body_bytes_sent $http_referer $http_user_agent $http_x_forwarded_for';

#access_log logs/access.log access;

include vhost/*.conf;
}

2.2)renzhiyuan.conf配置动静分离:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
server {
listen 80;
server_name localhost; # 请替换为实际域名
#路径根据 Resin定义路径配置,这里根据默认
root /usr/local/resin/webapps/ROOT;
index index.html index.php index.jsp index.htm;

#location ~ \.php$ {
# root html;
# fastcgi_pass 127.0.0.1:9000;
# fastcgi_index index.php;
# include fastcgi.conf;
# }

location ~ \.(jsp|jspx|do)$ {
proxy_set_header Host $host;
proxy_pass http://127.0.0.1:8080;
proxy_redirect off;

proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header Host $host;
client_max_body_size 10m;
client_body_buffer_size 128k;
proxy_connect_timeout 90;
proxy_send_timeout 90;
proxy_read_timeout 90;
proxy_buffer_size 4k;
proxy_buffers 4 32k;
proxy_busy_buffers_size 64k;
proxy_temp_file_write_size 64k;
}

location ~ .*\.(htm|html|gif|jpg|jpeg|png|bmp|swf|ico|rar|zip|txt|flv|mid|doc|ppt|pdf|xls|mp3|wma)$ {
expires 30d;
}

location ~ .*\.(js|css)$ {
expires 12h;
}
}
,