PERF: ability to crawl for titles without extra HEAD req

Also introduces a much more aggressive timeout for title crawling and adds gzip support for the body that is crawled.
2025-09-05 08:59:27 +08:00 · 2018-01-29 15:36:52 +11:00 · 2018-01-29 15:36:52 +11:00 · fa5880e04f
commit fa5880e04f
parent 1f6adbea5c
4 changed files with 176 additions and 26 deletions
--- a/spec/components/final_destination_spec.rb
+++ b/spec/components/final_destination_spec.rb
@@ -207,6 +207,32 @@ describe FinalDestination do
    end
  end

+  describe '.get' do
+    # Exercises FinalDestination#get with a block across an http -> https redirect.
+    it "can correctly stream with a redirect" do
+
+      FinalDestination.clear_https_cache!("wikipedia.com")
+      # First hop: the plain-http URL answers 302 with a location header pointing at the https URL.
+      stub_request(:get, "http://wikipedia.com/").
+        to_return(status: 302, body: "" , headers: { "location" => "https://wikipedia.com/" })
+
+      # webmock does not do chunks, so the test expects the whole stubbed body as one chunk
+      stub_request(:get, "https://wikipedia.com/").
+        to_return(status: 200, body: "<html><head>" , headers: {})
+
+      result = nil # NOTE(review): looks redundant; result is assigned by the call below
+      chunk = nil # declared outside the block so the assignment inside it is visible afterwards
+      # The block records the chunk it is handed and throws :done; get is still expected to return the final URL.
+      result = FinalDestination.new("http://wikipedia.com", opts).get do |resp, c|
+        chunk = c
+        throw :done
+      end
+
+      expect(result).to eq("https://wikipedia.com/")
+      expect(chunk).to eq("<html><head>")
+    end
+  end
+
  describe '.validate_uri' do
    context "host lookups" do
      it "works for various hosts" do
--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@@ -54,7 +54,18 @@ describe RetrieveTitle do
      )
      expect(title).to eq("Video Title")
    end
+  end

+  context "crawl" do # RetrieveTitle.crawl is given a URL and is expected to return the page's <title> text
+    it "can properly extract a title from a url" do
+      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
+        .to_return(status: 200, body: "<html><title>very amazing</title>")
+
+      # we still resolve the IP address for every host, so the lookup is stubbed to a fixed address
+      IPSocket.stubs(:getaddress).returns('100.2.3.4')
+
+      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
+    end
  end

 end